add, lib: debug output, refactor similar transactions lookup
add --debug=1 shows the top hits for similar past transactions. added: Hledger.Cli.Utils.journalSimilarTransaction provides --debug=1 output changed: Hledger.Cli.Commands.Add.transactionsSimilarTo -> Hledger.Data.Journal.journalTransactionsSimilarTo now takes an extra number-of-results argument
This commit is contained in:
parent
7979c7d74c
commit
44508f58c8
@ -67,6 +67,7 @@ module Hledger.Data.Journal (
|
||||
journalNextTransaction,
|
||||
journalPrevTransaction,
|
||||
journalPostings,
|
||||
journalTransactionsSimilarTo,
|
||||
-- journalPrices,
|
||||
-- * Standard account types
|
||||
journalBalanceSheetAccountQuery,
|
||||
@ -96,6 +97,7 @@ import "extra" Control.Monad.Extra (whenM)
|
||||
import Control.Monad.Reader as R
|
||||
import Control.Monad.ST (ST, runST)
|
||||
import Data.Array.ST (STArray, getElems, newListArray, writeArray)
|
||||
import Data.Char (toUpper)
|
||||
import Data.Default (Default(..))
|
||||
import Data.Function ((&))
|
||||
import qualified Data.HashTable.Class as H (toList)
|
||||
@ -125,6 +127,7 @@ import Hledger.Data.Transaction
|
||||
import Hledger.Data.TransactionModifier
|
||||
import Hledger.Data.Posting
|
||||
import Hledger.Query
|
||||
import Data.List (sortBy)
|
||||
|
||||
|
||||
-- try to make Journal ppShow-compatible
|
||||
@ -321,6 +324,55 @@ journalAccountNames = journalAccountNamesDeclaredOrImplied
|
||||
journalAccountNameTree :: Journal -> Tree AccountName
|
||||
journalAccountNameTree = accountNameTreeFrom . journalAccountNames
|
||||
|
||||
-- | Find up to N most similar and most recent transactions matching
|
||||
-- the given transaction description and query. Transactions are
|
||||
-- listed with their description's similarity score (see
|
||||
-- compareDescriptions), sorted by highest score and then by date.
|
||||
-- Only transactions with a similarity score greater than the minimum
|
||||
-- threshold (currently 0) are returned.
|
||||
journalTransactionsSimilarTo :: Journal -> Query -> Text -> Int -> [(Double,Transaction)]
|
||||
journalTransactionsSimilarTo j q desc n =
|
||||
take n $
|
||||
sortBy (\(s1,t1) (s2,t2) -> compare (s2,tdate t2) (s1,tdate t1)) $
|
||||
filter ((> threshold).fst)
|
||||
[(compareDescriptions desc $ tdescription t, t) | t <- ts]
|
||||
where
|
||||
ts = filter (q `matchesTransaction`) $ jtxns j
|
||||
threshold = 0
|
||||
|
||||
-- | Return a similarity measure, from 0 to 1, for two transaction
|
||||
-- descriptions. This is like compareStrings, but first strips out
|
||||
-- any numbers, to improve accuracy eg when there are bank transaction
|
||||
-- ids from imported data.
|
||||
compareDescriptions :: Text -> Text -> Double
|
||||
compareDescriptions s t = compareStrings s' t'
|
||||
where s' = simplify $ T.unpack s
|
||||
t' = simplify $ T.unpack t
|
||||
simplify = filter (not . (`elem` ("0123456789" :: String)))
|
||||
|
||||
-- | Return a similarity measure, from 0 to 1, for two strings. This
|
||||
-- was based on Simon White's string similarity algorithm
|
||||
-- (http://www.catalysoft.com/articles/StrikeAMatch.html), later found
|
||||
-- to be https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient,
|
||||
-- modified to handle short strings better.
|
||||
-- Todo: check out http://nlp.fi.muni.cz/raslan/2008/raslan08.pdf#page=14 .
|
||||
compareStrings :: String -> String -> Double
|
||||
compareStrings "" "" = 1
|
||||
compareStrings [_] "" = 0
|
||||
compareStrings "" [_] = 0
|
||||
compareStrings [a] [b] = if toUpper a == toUpper b then 1 else 0
|
||||
compareStrings s1 s2 = 2 * commonpairs / totalpairs
|
||||
where
|
||||
pairs1 = S.fromList $ wordLetterPairs $ uppercase s1
|
||||
pairs2 = S.fromList $ wordLetterPairs $ uppercase s2
|
||||
commonpairs = fromIntegral $ S.size $ S.intersection pairs1 pairs2
|
||||
totalpairs = fromIntegral $ S.size pairs1 + S.size pairs2
|
||||
|
||||
wordLetterPairs = concatMap letterPairs . words
|
||||
|
||||
letterPairs (a:b:rest) = [a,b] : letterPairs (b:rest)
|
||||
letterPairs _ = []
|
||||
|
||||
-- queries for standard account types
|
||||
|
||||
-- | Get a query for accounts of the specified types in this journal.
|
||||
|
||||
@ -11,7 +11,6 @@ module Hledger.Cli.Commands.Add (
|
||||
,add
|
||||
,appendToJournalFileOrStdout
|
||||
,journalAddTransaction
|
||||
,transactionsSimilarTo
|
||||
)
|
||||
where
|
||||
|
||||
@ -26,7 +25,6 @@ import Data.Char (toUpper, toLower)
|
||||
import Data.Either (isRight)
|
||||
import Data.Functor.Identity (Identity(..))
|
||||
import "base-compat-batteries" Data.List.Compat
|
||||
import qualified Data.Set as S
|
||||
import Data.Maybe (fromJust, fromMaybe, isJust)
|
||||
import Data.Text (Text)
|
||||
import qualified Data.Text as T
|
||||
@ -49,6 +47,7 @@ import Text.Printf
|
||||
import Hledger
|
||||
import Hledger.Cli.CliOptions
|
||||
import Hledger.Cli.Commands.Register (postingsReportAsText)
|
||||
import Hledger.Cli.Utils (journalSimilarTransaction)
|
||||
|
||||
|
||||
addmode = hledgerCommandMode
|
||||
@ -176,7 +175,7 @@ confirmedTransactionWizard prevInput es@EntryState{..} stack@(currentStage : _)
|
||||
|
||||
EnterDescAndComment (date, code) -> descriptionAndCommentWizard prevInput es >>= \case
|
||||
Just (desc, comment) -> do
|
||||
let mbaset = similarTransaction es desc
|
||||
let mbaset = journalSimilarTransaction esOpts esJournal desc
|
||||
es' = es
|
||||
{ esArgs = drop 1 esArgs
|
||||
, esPostings = []
|
||||
@ -258,15 +257,6 @@ confirmedTransactionWizard prevInput es@EntryState{..} stack@(currentStage : _)
|
||||
where
|
||||
replaceNthOrAppend n newElem xs = take n xs ++ [newElem] ++ drop (n + 1) xs
|
||||
|
||||
-- Identify the closest recent match for this description in past transactions.
|
||||
similarTransaction :: EntryState -> Text -> Maybe Transaction
|
||||
similarTransaction EntryState{..} desc =
|
||||
let q = queryFromFlags . rsOpts $ reportspec_ esOpts
|
||||
historymatches = transactionsSimilarTo esJournal q desc
|
||||
bestmatch | null historymatches = Nothing
|
||||
| otherwise = Just $ snd $ head historymatches
|
||||
in bestmatch
|
||||
|
||||
dateAndCodeWizard PrevInput{..} EntryState{..} = do
|
||||
let def = headDef (T.unpack $ showDate esDefDate) esArgs
|
||||
retryMsg "A valid hledger smart date is required. Eg: 2014/2/14, 14, yesterday." $
|
||||
@ -482,49 +472,3 @@ registerFromString s = do
|
||||
capitalize :: String -> String
|
||||
capitalize "" = ""
|
||||
capitalize (c:cs) = toUpper c : cs
|
||||
|
||||
-- | Find the most similar and recent transactions matching the given
|
||||
-- transaction description and report query. Transactions are listed
|
||||
-- with their "relevancy" score, most relevant first.
|
||||
transactionsSimilarTo :: Journal -> Query -> Text -> [(Double,Transaction)]
|
||||
transactionsSimilarTo j q desc =
|
||||
sortBy compareRelevanceAndRecency
|
||||
$ filter ((> threshold).fst)
|
||||
[(compareDescriptions desc $ tdescription t, t) | t <- ts]
|
||||
where
|
||||
compareRelevanceAndRecency (n1,t1) (n2,t2) = compare (n2,tdate t2) (n1,tdate t1)
|
||||
ts = filter (q `matchesTransaction`) $ jtxns j
|
||||
threshold = 0
|
||||
|
||||
-- | Return a similarity measure, from 0 to 1, for two transaction
|
||||
-- descriptions. This is like compareStrings, but first strips out
|
||||
-- any numbers, to improve accuracy eg when there are bank transaction
|
||||
-- ids from imported data.
|
||||
compareDescriptions :: Text -> Text -> Double
|
||||
compareDescriptions s t = compareStrings s' t'
|
||||
where s' = simplify $ T.unpack s
|
||||
t' = simplify $ T.unpack t
|
||||
simplify = filter (not . (`elem` ("0123456789" :: String)))
|
||||
|
||||
-- | Return a similarity measure, from 0 to 1, for two strings. This
|
||||
-- was based on Simon White's string similarity algorithm
|
||||
-- (http://www.catalysoft.com/articles/StrikeAMatch.html), later found
|
||||
-- to be https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient,
|
||||
-- modified to handle short strings better.
|
||||
-- Todo: check out http://nlp.fi.muni.cz/raslan/2008/raslan08.pdf#page=14 .
|
||||
compareStrings :: String -> String -> Double
|
||||
compareStrings "" "" = 1
|
||||
compareStrings [_] "" = 0
|
||||
compareStrings "" [_] = 0
|
||||
compareStrings [a] [b] = if toUpper a == toUpper b then 1 else 0
|
||||
compareStrings s1 s2 = 2 * commonpairs / totalpairs
|
||||
where
|
||||
pairs1 = S.fromList $ wordLetterPairs $ uppercase s1
|
||||
pairs2 = S.fromList $ wordLetterPairs $ uppercase s2
|
||||
commonpairs = fromIntegral $ S.size $ S.intersection pairs1 pairs2
|
||||
totalpairs = fromIntegral $ S.size pairs1 + S.size pairs2
|
||||
|
||||
wordLetterPairs = concatMap letterPairs . words
|
||||
|
||||
letterPairs (a:b:rest) = [a,b] : letterPairs (b:rest)
|
||||
letterPairs _ = []
|
||||
|
||||
@ -32,7 +32,6 @@ import Hledger.Read.CsvReader (CSV, printCSV)
|
||||
import Hledger
|
||||
import Hledger.Cli.CliOptions
|
||||
import Hledger.Cli.Utils
|
||||
import Hledger.Cli.Commands.Add ( transactionsSimilarTo )
|
||||
|
||||
|
||||
printmode = hledgerCommandMode
|
||||
@ -194,16 +193,7 @@ postingToCSV p =
|
||||
-- | Print the transaction most closely and recently matching a description
|
||||
-- (and the query, if any).
|
||||
printMatch :: CliOpts -> Journal -> Text -> IO ()
|
||||
printMatch CliOpts{reportspec_=rspec} j desc = do
|
||||
case similarTransaction' j (rsQuery rspec) desc of
|
||||
printMatch opts j desc = do
|
||||
case journalSimilarTransaction opts j desc of
|
||||
Nothing -> putStrLn "no matches found."
|
||||
Just t -> T.putStr $ showTransaction t
|
||||
|
||||
where
|
||||
-- Identify the closest recent match for this description in past transactions.
|
||||
similarTransaction' :: Journal -> Query -> Text -> Maybe Transaction
|
||||
similarTransaction' j q desc
|
||||
| null historymatches = Nothing
|
||||
| otherwise = Just $ snd $ head historymatches
|
||||
where
|
||||
historymatches = transactionsSimilarTo j q desc
|
||||
|
||||
@ -26,6 +26,7 @@ module Hledger.Cli.Utils
|
||||
pivotByOpts,
|
||||
anonymiseByOpts,
|
||||
utcTimeToClockTime,
|
||||
journalSimilarTransaction,
|
||||
tests_Cli_Utils,
|
||||
)
|
||||
where
|
||||
@ -38,7 +39,7 @@ import qualified Data.Text.IO as T
|
||||
import qualified Data.Text.Lazy as TL
|
||||
import qualified Data.Text.Lazy.IO as TL
|
||||
import Data.Time (UTCTime, Day, addDays)
|
||||
import Safe (readMay)
|
||||
import Safe (readMay, headMay)
|
||||
import System.Console.CmdArgs
|
||||
import System.Directory (getModificationTime, getDirectoryContents, copyFile, doesFileExist)
|
||||
import System.Exit
|
||||
@ -302,6 +303,17 @@ backupNumber f g = case g =~ ("^" ++ f ++ "\\.([0-9]+)$") of
|
||||
(_::FilePath, _::FilePath, _::FilePath, [ext::FilePath]) -> readMay ext
|
||||
_ -> Nothing
|
||||
|
||||
-- Identify the closest recent match for this description in past transactions.
|
||||
-- If the options specify a query, only matched transactions are considered.
|
||||
journalSimilarTransaction :: CliOpts -> Journal -> T.Text -> Maybe Transaction
|
||||
journalSimilarTransaction cliopts j desc = mbestmatch
|
||||
where
|
||||
mbestmatch = snd <$> headMay bestmatches
|
||||
bestmatches =
|
||||
dbg1With (unlines . ("similar transactions:":) . map (\(score,Transaction{..}) -> printf "%0.3f %s %s" score (show tdate) tdescription)) $
|
||||
journalTransactionsSimilarTo j q desc 10
|
||||
q = queryFromFlags $ rsOpts $ reportspec_ cliopts
|
||||
|
||||
tests_Cli_Utils = tests "Utils" [
|
||||
|
||||
-- tests "journalApplyValue" [
|
||||
|
||||
Loading…
Reference in New Issue
Block a user