add, lib: debug output, refactor similar transactions lookup

add --debug=1 shows the top hits for similar past transactions.

added:
Hledger.Cli.Utils.journalSimilarTransaction
 provides --debug=1 output

changed:
Hledger.Cli.Commands.Add.transactionsSimilarTo -> Hledger.Data.Journal.journalTransactionsSimilarTo
 now takes an extra number-of-results argument
This commit is contained in:
Simon Michael 2021-02-20 12:40:47 -08:00
parent 7979c7d74c
commit 44508f58c8
4 changed files with 71 additions and 73 deletions

View File

@ -67,6 +67,7 @@ module Hledger.Data.Journal (
journalNextTransaction, journalNextTransaction,
journalPrevTransaction, journalPrevTransaction,
journalPostings, journalPostings,
journalTransactionsSimilarTo,
-- journalPrices, -- journalPrices,
-- * Standard account types -- * Standard account types
journalBalanceSheetAccountQuery, journalBalanceSheetAccountQuery,
@ -96,6 +97,7 @@ import "extra" Control.Monad.Extra (whenM)
import Control.Monad.Reader as R import Control.Monad.Reader as R
import Control.Monad.ST (ST, runST) import Control.Monad.ST (ST, runST)
import Data.Array.ST (STArray, getElems, newListArray, writeArray) import Data.Array.ST (STArray, getElems, newListArray, writeArray)
import Data.Char (toUpper)
import Data.Default (Default(..)) import Data.Default (Default(..))
import Data.Function ((&)) import Data.Function ((&))
import qualified Data.HashTable.Class as H (toList) import qualified Data.HashTable.Class as H (toList)
@ -125,6 +127,7 @@ import Hledger.Data.Transaction
import Hledger.Data.TransactionModifier import Hledger.Data.TransactionModifier
import Hledger.Data.Posting import Hledger.Data.Posting
import Hledger.Query import Hledger.Query
import Data.List (sortBy)
-- try to make Journal ppShow-compatible -- try to make Journal ppShow-compatible
@ -321,6 +324,55 @@ journalAccountNames = journalAccountNamesDeclaredOrImplied
journalAccountNameTree :: Journal -> Tree AccountName journalAccountNameTree :: Journal -> Tree AccountName
journalAccountNameTree = accountNameTreeFrom . journalAccountNames journalAccountNameTree = accountNameTreeFrom . journalAccountNames
-- | Find up to N most similar and most recent transactions matching
-- the given transaction description and query. Transactions are
-- listed with their description's similarity score (see
-- compareDescriptions), sorted by highest score and then by date.
-- Only transactions with a similarity score greater than the minimum
-- threshold (currently 0) are returned.
journalTransactionsSimilarTo :: Journal -> Query -> Text -> Int -> [(Double,Transaction)]
journalTransactionsSimilarTo j q desc n =
take n $
sortBy (\(s1,t1) (s2,t2) -> compare (s2,tdate t2) (s1,tdate t1)) $
filter ((> threshold).fst)
[(compareDescriptions desc $ tdescription t, t) | t <- ts]
where
ts = filter (q `matchesTransaction`) $ jtxns j
threshold = 0
-- | Return a similarity measure, from 0 to 1, for two transaction
-- descriptions. This is like compareStrings, but first strips out
-- any numbers, to improve accuracy eg when there are bank transaction
-- ids from imported data.
compareDescriptions :: Text -> Text -> Double
compareDescriptions s t = compareStrings s' t'
where s' = simplify $ T.unpack s
t' = simplify $ T.unpack t
simplify = filter (not . (`elem` ("0123456789" :: String)))
-- | Return a similarity measure, from 0 to 1, for two strings. This
-- was based on Simon White's string similarity algorithm
-- (http://www.catalysoft.com/articles/StrikeAMatch.html), later found
-- to be https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient,
-- modified to handle short strings better.
-- Todo: check out http://nlp.fi.muni.cz/raslan/2008/raslan08.pdf#page=14 .
compareStrings :: String -> String -> Double
compareStrings "" "" = 1
compareStrings [_] "" = 0
compareStrings "" [_] = 0
compareStrings [a] [b] = if toUpper a == toUpper b then 1 else 0
compareStrings s1 s2 = 2 * commonpairs / totalpairs
where
pairs1 = S.fromList $ wordLetterPairs $ uppercase s1
pairs2 = S.fromList $ wordLetterPairs $ uppercase s2
commonpairs = fromIntegral $ S.size $ S.intersection pairs1 pairs2
totalpairs = fromIntegral $ S.size pairs1 + S.size pairs2
wordLetterPairs = concatMap letterPairs . words
letterPairs (a:b:rest) = [a,b] : letterPairs (b:rest)
letterPairs _ = []
-- queries for standard account types -- queries for standard account types
-- | Get a query for accounts of the specified types in this journal. -- | Get a query for accounts of the specified types in this journal.

View File

@ -11,7 +11,6 @@ module Hledger.Cli.Commands.Add (
,add ,add
,appendToJournalFileOrStdout ,appendToJournalFileOrStdout
,journalAddTransaction ,journalAddTransaction
,transactionsSimilarTo
) )
where where
@ -26,7 +25,6 @@ import Data.Char (toUpper, toLower)
import Data.Either (isRight) import Data.Either (isRight)
import Data.Functor.Identity (Identity(..)) import Data.Functor.Identity (Identity(..))
import "base-compat-batteries" Data.List.Compat import "base-compat-batteries" Data.List.Compat
import qualified Data.Set as S
import Data.Maybe (fromJust, fromMaybe, isJust) import Data.Maybe (fromJust, fromMaybe, isJust)
import Data.Text (Text) import Data.Text (Text)
import qualified Data.Text as T import qualified Data.Text as T
@ -49,6 +47,7 @@ import Text.Printf
import Hledger import Hledger
import Hledger.Cli.CliOptions import Hledger.Cli.CliOptions
import Hledger.Cli.Commands.Register (postingsReportAsText) import Hledger.Cli.Commands.Register (postingsReportAsText)
import Hledger.Cli.Utils (journalSimilarTransaction)
addmode = hledgerCommandMode addmode = hledgerCommandMode
@ -176,7 +175,7 @@ confirmedTransactionWizard prevInput es@EntryState{..} stack@(currentStage : _)
EnterDescAndComment (date, code) -> descriptionAndCommentWizard prevInput es >>= \case EnterDescAndComment (date, code) -> descriptionAndCommentWizard prevInput es >>= \case
Just (desc, comment) -> do Just (desc, comment) -> do
let mbaset = similarTransaction es desc let mbaset = journalSimilarTransaction esOpts esJournal desc
es' = es es' = es
{ esArgs = drop 1 esArgs { esArgs = drop 1 esArgs
, esPostings = [] , esPostings = []
@ -258,15 +257,6 @@ confirmedTransactionWizard prevInput es@EntryState{..} stack@(currentStage : _)
where where
replaceNthOrAppend n newElem xs = take n xs ++ [newElem] ++ drop (n + 1) xs replaceNthOrAppend n newElem xs = take n xs ++ [newElem] ++ drop (n + 1) xs
-- Identify the closest recent match for this description in past transactions.
similarTransaction :: EntryState -> Text -> Maybe Transaction
similarTransaction EntryState{..} desc =
let q = queryFromFlags . rsOpts $ reportspec_ esOpts
historymatches = transactionsSimilarTo esJournal q desc
bestmatch | null historymatches = Nothing
| otherwise = Just $ snd $ head historymatches
in bestmatch
dateAndCodeWizard PrevInput{..} EntryState{..} = do dateAndCodeWizard PrevInput{..} EntryState{..} = do
let def = headDef (T.unpack $ showDate esDefDate) esArgs let def = headDef (T.unpack $ showDate esDefDate) esArgs
retryMsg "A valid hledger smart date is required. Eg: 2014/2/14, 14, yesterday." $ retryMsg "A valid hledger smart date is required. Eg: 2014/2/14, 14, yesterday." $
@ -482,49 +472,3 @@ registerFromString s = do
capitalize :: String -> String capitalize :: String -> String
capitalize "" = "" capitalize "" = ""
capitalize (c:cs) = toUpper c : cs capitalize (c:cs) = toUpper c : cs
-- | Find the most similar and recent transactions matching the given
-- transaction description and report query. Transactions are listed
-- with their "relevancy" score, most relevant first.
transactionsSimilarTo :: Journal -> Query -> Text -> [(Double,Transaction)]
transactionsSimilarTo j q desc =
sortBy compareRelevanceAndRecency
$ filter ((> threshold).fst)
[(compareDescriptions desc $ tdescription t, t) | t <- ts]
where
compareRelevanceAndRecency (n1,t1) (n2,t2) = compare (n2,tdate t2) (n1,tdate t1)
ts = filter (q `matchesTransaction`) $ jtxns j
threshold = 0
-- | Return a similarity measure, from 0 to 1, for two transaction
-- descriptions. This is like compareStrings, but first strips out
-- any numbers, to improve accuracy eg when there are bank transaction
-- ids from imported data.
compareDescriptions :: Text -> Text -> Double
compareDescriptions s t = compareStrings s' t'
where s' = simplify $ T.unpack s
t' = simplify $ T.unpack t
simplify = filter (not . (`elem` ("0123456789" :: String)))
-- | Return a similarity measure, from 0 to 1, for two strings. This
-- was based on Simon White's string similarity algorithm
-- (http://www.catalysoft.com/articles/StrikeAMatch.html), later found
-- to be https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient,
-- modified to handle short strings better.
-- Todo: check out http://nlp.fi.muni.cz/raslan/2008/raslan08.pdf#page=14 .
compareStrings :: String -> String -> Double
compareStrings "" "" = 1
compareStrings [_] "" = 0
compareStrings "" [_] = 0
compareStrings [a] [b] = if toUpper a == toUpper b then 1 else 0
compareStrings s1 s2 = 2 * commonpairs / totalpairs
where
pairs1 = S.fromList $ wordLetterPairs $ uppercase s1
pairs2 = S.fromList $ wordLetterPairs $ uppercase s2
commonpairs = fromIntegral $ S.size $ S.intersection pairs1 pairs2
totalpairs = fromIntegral $ S.size pairs1 + S.size pairs2
wordLetterPairs = concatMap letterPairs . words
letterPairs (a:b:rest) = [a,b] : letterPairs (b:rest)
letterPairs _ = []

View File

@ -32,7 +32,6 @@ import Hledger.Read.CsvReader (CSV, printCSV)
import Hledger import Hledger
import Hledger.Cli.CliOptions import Hledger.Cli.CliOptions
import Hledger.Cli.Utils import Hledger.Cli.Utils
import Hledger.Cli.Commands.Add ( transactionsSimilarTo )
printmode = hledgerCommandMode printmode = hledgerCommandMode
@ -194,16 +193,7 @@ postingToCSV p =
-- | Print the transaction most closely and recently matching a description -- | Print the transaction most closely and recently matching a description
-- (and the query, if any). -- (and the query, if any).
printMatch :: CliOpts -> Journal -> Text -> IO () printMatch :: CliOpts -> Journal -> Text -> IO ()
printMatch CliOpts{reportspec_=rspec} j desc = do printMatch opts j desc = do
case similarTransaction' j (rsQuery rspec) desc of case journalSimilarTransaction opts j desc of
Nothing -> putStrLn "no matches found." Nothing -> putStrLn "no matches found."
Just t -> T.putStr $ showTransaction t Just t -> T.putStr $ showTransaction t
where
-- Identify the closest recent match for this description in past transactions.
similarTransaction' :: Journal -> Query -> Text -> Maybe Transaction
similarTransaction' j q desc
| null historymatches = Nothing
| otherwise = Just $ snd $ head historymatches
where
historymatches = transactionsSimilarTo j q desc

View File

@ -26,6 +26,7 @@ module Hledger.Cli.Utils
pivotByOpts, pivotByOpts,
anonymiseByOpts, anonymiseByOpts,
utcTimeToClockTime, utcTimeToClockTime,
journalSimilarTransaction,
tests_Cli_Utils, tests_Cli_Utils,
) )
where where
@ -38,7 +39,7 @@ import qualified Data.Text.IO as T
import qualified Data.Text.Lazy as TL import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.IO as TL import qualified Data.Text.Lazy.IO as TL
import Data.Time (UTCTime, Day, addDays) import Data.Time (UTCTime, Day, addDays)
import Safe (readMay) import Safe (readMay, headMay)
import System.Console.CmdArgs import System.Console.CmdArgs
import System.Directory (getModificationTime, getDirectoryContents, copyFile, doesFileExist) import System.Directory (getModificationTime, getDirectoryContents, copyFile, doesFileExist)
import System.Exit import System.Exit
@ -302,6 +303,17 @@ backupNumber f g = case g =~ ("^" ++ f ++ "\\.([0-9]+)$") of
(_::FilePath, _::FilePath, _::FilePath, [ext::FilePath]) -> readMay ext (_::FilePath, _::FilePath, _::FilePath, [ext::FilePath]) -> readMay ext
_ -> Nothing _ -> Nothing
-- Identify the closest recent match for this description in past transactions.
-- If the options specify a query, only matched transactions are considered.
journalSimilarTransaction :: CliOpts -> Journal -> T.Text -> Maybe Transaction
journalSimilarTransaction cliopts j desc = mbestmatch
where
mbestmatch = snd <$> headMay bestmatches
bestmatches =
dbg1With (unlines . ("similar transactions:":) . map (\(score,Transaction{..}) -> printf "%0.3f %s %s" score (show tdate) tdescription)) $
journalTransactionsSimilarTo j q desc 10
q = queryFromFlags $ rsOpts $ reportspec_ cliopts
tests_Cli_Utils = tests "Utils" [ tests_Cli_Utils = tests "Utils" [
-- tests "journalApplyValue" [ -- tests "journalApplyValue" [