add, lib: debug output, refactor similar transactions lookup

add --debug=1 shows the top hits for similar past transactions.

added:
Hledger.Cli.Utils.journalSimilarTransaction
 provides --debug=1 output

changed:
Hledger.Cli.Commands.Add.transactionsSimilarTo -> Hledger.Data.Journal.journalTransactionsSimilarTo
 now takes an extra number-of-results argument
This commit is contained in:
Simon Michael 2021-02-20 12:40:47 -08:00
parent 7979c7d74c
commit 44508f58c8
4 changed files with 71 additions and 73 deletions

View File

@ -67,6 +67,7 @@ module Hledger.Data.Journal (
journalNextTransaction,
journalPrevTransaction,
journalPostings,
journalTransactionsSimilarTo,
-- journalPrices,
-- * Standard account types
journalBalanceSheetAccountQuery,
@ -96,6 +97,7 @@ import "extra" Control.Monad.Extra (whenM)
import Control.Monad.Reader as R
import Control.Monad.ST (ST, runST)
import Data.Array.ST (STArray, getElems, newListArray, writeArray)
import Data.Char (toUpper)
import Data.Default (Default(..))
import Data.Function ((&))
import qualified Data.HashTable.Class as H (toList)
@ -125,6 +127,7 @@ import Hledger.Data.Transaction
import Hledger.Data.TransactionModifier
import Hledger.Data.Posting
import Hledger.Query
import Data.List (sortBy)
-- try to make Journal ppShow-compatible
@ -321,6 +324,55 @@ journalAccountNames = journalAccountNamesDeclaredOrImplied
journalAccountNameTree :: Journal -> Tree AccountName
journalAccountNameTree = accountNameTreeFrom . journalAccountNames
-- | Find up to N most similar and most recent transactions matching
-- the given transaction description and query. Transactions are
-- listed with their description's similarity score (see
-- compareDescriptions), sorted by highest score and then by date.
-- Only transactions with a similarity score greater than the minimum
-- threshold (currently 0) are returned.
journalTransactionsSimilarTo :: Journal -> Query -> Text -> Int -> [(Double,Transaction)]
journalTransactionsSimilarTo j q desc n =
take n $
sortBy (\(s1,t1) (s2,t2) -> compare (s2,tdate t2) (s1,tdate t1)) $
filter ((> threshold).fst)
[(compareDescriptions desc $ tdescription t, t) | t <- ts]
where
ts = filter (q `matchesTransaction`) $ jtxns j
threshold = 0
-- | Return a similarity measure, from 0 to 1, for two transaction
-- descriptions. This is like compareStrings, but first strips out
-- any numbers, to improve accuracy eg when there are bank transaction
-- ids from imported data.
compareDescriptions :: Text -> Text -> Double
compareDescriptions s t = compareStrings s' t'
where s' = simplify $ T.unpack s
t' = simplify $ T.unpack t
simplify = filter (not . (`elem` ("0123456789" :: String)))
-- | Return a similarity measure, from 0 to 1, for two strings. This
-- was based on Simon White's string similarity algorithm
-- (http://www.catalysoft.com/articles/StrikeAMatch.html), later found
-- to be https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient,
-- modified to handle short strings better.
-- Todo: check out http://nlp.fi.muni.cz/raslan/2008/raslan08.pdf#page=14 .
compareStrings :: String -> String -> Double
compareStrings "" "" = 1
compareStrings [_] "" = 0
compareStrings "" [_] = 0
compareStrings [a] [b] = if toUpper a == toUpper b then 1 else 0
compareStrings s1 s2 = 2 * commonpairs / totalpairs
where
pairs1 = S.fromList $ wordLetterPairs $ uppercase s1
pairs2 = S.fromList $ wordLetterPairs $ uppercase s2
commonpairs = fromIntegral $ S.size $ S.intersection pairs1 pairs2
totalpairs = fromIntegral $ S.size pairs1 + S.size pairs2
wordLetterPairs = concatMap letterPairs . words
letterPairs (a:b:rest) = [a,b] : letterPairs (b:rest)
letterPairs _ = []
-- queries for standard account types
-- | Get a query for accounts of the specified types in this journal.

View File

@ -11,7 +11,6 @@ module Hledger.Cli.Commands.Add (
,add
,appendToJournalFileOrStdout
,journalAddTransaction
,transactionsSimilarTo
)
where
@ -26,7 +25,6 @@ import Data.Char (toUpper, toLower)
import Data.Either (isRight)
import Data.Functor.Identity (Identity(..))
import "base-compat-batteries" Data.List.Compat
import qualified Data.Set as S
import Data.Maybe (fromJust, fromMaybe, isJust)
import Data.Text (Text)
import qualified Data.Text as T
@ -49,6 +47,7 @@ import Text.Printf
import Hledger
import Hledger.Cli.CliOptions
import Hledger.Cli.Commands.Register (postingsReportAsText)
import Hledger.Cli.Utils (journalSimilarTransaction)
addmode = hledgerCommandMode
@ -176,7 +175,7 @@ confirmedTransactionWizard prevInput es@EntryState{..} stack@(currentStage : _)
EnterDescAndComment (date, code) -> descriptionAndCommentWizard prevInput es >>= \case
Just (desc, comment) -> do
let mbaset = similarTransaction es desc
let mbaset = journalSimilarTransaction esOpts esJournal desc
es' = es
{ esArgs = drop 1 esArgs
, esPostings = []
@ -258,15 +257,6 @@ confirmedTransactionWizard prevInput es@EntryState{..} stack@(currentStage : _)
where
replaceNthOrAppend n newElem xs = take n xs ++ [newElem] ++ drop (n + 1) xs
-- Identify the closest recent match for this description in past transactions.
similarTransaction :: EntryState -> Text -> Maybe Transaction
similarTransaction EntryState{..} desc =
let q = queryFromFlags . rsOpts $ reportspec_ esOpts
historymatches = transactionsSimilarTo esJournal q desc
bestmatch | null historymatches = Nothing
| otherwise = Just $ snd $ head historymatches
in bestmatch
dateAndCodeWizard PrevInput{..} EntryState{..} = do
let def = headDef (T.unpack $ showDate esDefDate) esArgs
retryMsg "A valid hledger smart date is required. Eg: 2014/2/14, 14, yesterday." $
@ -482,49 +472,3 @@ registerFromString s = do
capitalize :: String -> String
capitalize "" = ""
capitalize (c:cs) = toUpper c : cs
-- | Find the most similar and recent transactions matching the given
-- transaction description and report query. Transactions are listed
-- with their "relevancy" score, most relevant first.
transactionsSimilarTo :: Journal -> Query -> Text -> [(Double,Transaction)]
transactionsSimilarTo j q desc =
sortBy compareRelevanceAndRecency
$ filter ((> threshold).fst)
[(compareDescriptions desc $ tdescription t, t) | t <- ts]
where
compareRelevanceAndRecency (n1,t1) (n2,t2) = compare (n2,tdate t2) (n1,tdate t1)
ts = filter (q `matchesTransaction`) $ jtxns j
threshold = 0
-- | Return a similarity measure, from 0 to 1, for two transaction
-- descriptions. This is like compareStrings, but first strips out
-- any numbers, to improve accuracy eg when there are bank transaction
-- ids from imported data.
compareDescriptions :: Text -> Text -> Double
compareDescriptions s t = compareStrings s' t'
where s' = simplify $ T.unpack s
t' = simplify $ T.unpack t
simplify = filter (not . (`elem` ("0123456789" :: String)))
-- | Return a similarity measure, from 0 to 1, for two strings. This
-- was based on Simon White's string similarity algorithm
-- (http://www.catalysoft.com/articles/StrikeAMatch.html), later found
-- to be https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient,
-- modified to handle short strings better.
-- Todo: check out http://nlp.fi.muni.cz/raslan/2008/raslan08.pdf#page=14 .
compareStrings :: String -> String -> Double
compareStrings "" "" = 1
compareStrings [_] "" = 0
compareStrings "" [_] = 0
compareStrings [a] [b] = if toUpper a == toUpper b then 1 else 0
compareStrings s1 s2 = 2 * commonpairs / totalpairs
where
pairs1 = S.fromList $ wordLetterPairs $ uppercase s1
pairs2 = S.fromList $ wordLetterPairs $ uppercase s2
commonpairs = fromIntegral $ S.size $ S.intersection pairs1 pairs2
totalpairs = fromIntegral $ S.size pairs1 + S.size pairs2
wordLetterPairs = concatMap letterPairs . words
letterPairs (a:b:rest) = [a,b] : letterPairs (b:rest)
letterPairs _ = []

View File

@ -32,7 +32,6 @@ import Hledger.Read.CsvReader (CSV, printCSV)
import Hledger
import Hledger.Cli.CliOptions
import Hledger.Cli.Utils
import Hledger.Cli.Commands.Add ( transactionsSimilarTo )
printmode = hledgerCommandMode
@ -194,16 +193,7 @@ postingToCSV p =
-- | Print the transaction most closely and recently matching a description
-- (and the query, if any).
printMatch :: CliOpts -> Journal -> Text -> IO ()
printMatch CliOpts{reportspec_=rspec} j desc = do
case similarTransaction' j (rsQuery rspec) desc of
Nothing -> putStrLn "no matches found."
Just t -> T.putStr $ showTransaction t
where
-- Identify the closest recent match for this description in past transactions.
similarTransaction' :: Journal -> Query -> Text -> Maybe Transaction
similarTransaction' j q desc
| null historymatches = Nothing
| otherwise = Just $ snd $ head historymatches
where
historymatches = transactionsSimilarTo j q desc
printMatch opts j desc = do
case journalSimilarTransaction opts j desc of
Nothing -> putStrLn "no matches found."
Just t -> T.putStr $ showTransaction t

View File

@ -26,6 +26,7 @@ module Hledger.Cli.Utils
pivotByOpts,
anonymiseByOpts,
utcTimeToClockTime,
journalSimilarTransaction,
tests_Cli_Utils,
)
where
@ -38,7 +39,7 @@ import qualified Data.Text.IO as T
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.IO as TL
import Data.Time (UTCTime, Day, addDays)
import Safe (readMay)
import Safe (readMay, headMay)
import System.Console.CmdArgs
import System.Directory (getModificationTime, getDirectoryContents, copyFile, doesFileExist)
import System.Exit
@ -302,6 +303,17 @@ backupNumber f g = case g =~ ("^" ++ f ++ "\\.([0-9]+)$") of
(_::FilePath, _::FilePath, _::FilePath, [ext::FilePath]) -> readMay ext
_ -> Nothing
-- Identify the closest recent match for this description in past transactions.
-- If the options specify a query, only matched transactions are considered.
journalSimilarTransaction :: CliOpts -> Journal -> T.Text -> Maybe Transaction
journalSimilarTransaction cliopts j desc = mbestmatch
where
mbestmatch = snd <$> headMay bestmatches
bestmatches =
dbg1With (unlines . ("similar transactions:":) . map (\(score,Transaction{..}) -> printf "%0.3f %s %s" score (show tdate) tdescription)) $
journalTransactionsSimilarTo j q desc 10
q = queryFromFlags $ rsOpts $ reportspec_ cliopts
tests_Cli_Utils = tests "Utils" [
-- tests "journalApplyValue" [