From c03d6b11234f579b77c87ecc7d4bbe8f8506c6f0 Mon Sep 17 00:00:00 2001 From: Simon Michael Date: Mon, 27 Mar 2023 15:13:49 -1000 Subject: [PATCH] imp: print: --match makes better choices Previously, similarity completely outweighed recency, so a slightly-more-similar transaction would always be selected no matter how old it was. Now similarity and recency are more balanced, and it should produce the desired transaction more often. There is also new debug output (at debug level 1) for troubleshooting. --- hledger-lib/Hledger/Data/Journal.hs | 46 +++++++++++++++++++++-------- hledger/Hledger/Cli/Utils.hs | 7 ++--- 2 files changed, 36 insertions(+), 17 deletions(-) diff --git a/hledger-lib/Hledger/Data/Journal.hs b/hledger-lib/Hledger/Data/Journal.hs index 92644db8f..39fed90fa 100644 --- a/hledger-lib/Hledger/Data/Journal.hs +++ b/hledger-lib/Hledger/Data/Journal.hs @@ -3,6 +3,7 @@ {-# LANGUAGE Rank2Types #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE TupleSections #-} +{-# LANGUAGE RecordWildCards #-} {-| @@ -122,8 +123,8 @@ import Data.Maybe (catMaybes, fromMaybe, mapMaybe, maybeToList) import qualified Data.Set as S import Data.Text (Text) import qualified Data.Text as T -import Safe (headMay, headDef, maximumMay, minimumMay) -import Data.Time.Calendar (Day, addDays, fromGregorian) +import Safe (headMay, headDef, maximumMay, minimumMay, lastDef) +import Data.Time.Calendar (Day, addDays, fromGregorian, diffDays) import Data.Time.Clock.POSIX (POSIXTime) import Data.Tree (Tree(..), flatten) import Text.Printf (printf) @@ -140,6 +141,9 @@ import Hledger.Data.TransactionModifier import Hledger.Data.Valuation import Hledger.Query import System.FilePath (takeFileName) +import Data.Ord (comparing) +import Hledger.Data.Dates (nulldate) +import Data.List (sort) -- | A parser of text that runs in some monad, keeping a Journal as state. @@ -429,20 +433,38 @@ journalInheritedAccountTags j a = as = a : parentAccountNames a -- PERF: cache in journal ? +type DateWeightedSimilarityScore = Double +type SimilarityScore = Double +type Age = Integer + -- | Find up to N most similar and most recent transactions matching --- the given transaction description and query. Transactions are --- listed with their description's similarity score (see --- compareDescriptions), sorted by highest score and then by date. --- Only transactions with a similarity score greater than a minimum --- threshold (currently 0) are returned. -journalTransactionsSimilarTo :: Journal -> Query -> Text -> Int -> [(Double,Transaction)] -journalTransactionsSimilarTo Journal{jtxns} q desc n = +-- the given transaction description and query and exceeding the given +-- description similarity score (0 to 1, see compareDescriptions). +-- Returns transactions along with +-- their age in days compared to the latest transaction date, +-- their description similarity score, +-- and a heuristically date-weighted variant of this that favours more recent transactions. +journalTransactionsSimilarTo :: Journal -> Text -> Query -> SimilarityScore -> Int + -> [(DateWeightedSimilarityScore, Age, SimilarityScore, Transaction)] +journalTransactionsSimilarTo Journal{jtxns} desc q similaritythreshold n = take n $ - sortBy (\(s1,t1) (s2,t2) -> compare (s2,tdate t2) (s1,tdate t1)) $ - filter ((> threshold).fst) + dbg1With ( + unlines . + ("up to 30 transactions above description similarity threshold "<>show similaritythreshold<>" ordered by recency-weighted similarity:":) . + take 30 . + map ( \(w,a,s,Transaction{..}) -> printf "weighted:%8.3f age:%4d similarity:%5.3f %s %s" w a s (show tdate) tdescription )) $ + sortBy (comparing (negate.first4)) $ + map (\(s,t) -> (weightedScore (s,t), age t, s, t)) $ + filter ((> similaritythreshold).fst) [(compareDescriptions desc $ tdescription t, t) | t <- jtxns, q `matchesTransaction` t] where - threshold = 0 + latest = lastDef nulldate $ sort $ map tdate jtxns + age = diffDays latest . tdate + -- Combine similarity and recency heuristically. This gave decent results + -- in my "find most recent invoice" use case in 2023-03, + -- but will probably need more attention. + weightedScore :: (Double, Transaction) -> Double + weightedScore (s, t) = 100 * s - fromIntegral (age t) / 4 -- | Return a similarity score from 0 to 1.5 for two transaction descriptions. -- This is based on compareStrings, with the following modifications: diff --git a/hledger/Hledger/Cli/Utils.hs b/hledger/Hledger/Cli/Utils.hs index 12d2799b9..d18882d81 100644 --- a/hledger/Hledger/Cli/Utils.hs +++ b/hledger/Hledger/Cli/Utils.hs @@ -251,12 +251,9 @@ backupNumber f g = case g =~ ("^" ++ f ++ "\\.([0-9]+)$") of -- Identify the closest recent match for this description in past transactions. -- If the options specify a query, only matched transactions are considered. journalSimilarTransaction :: CliOpts -> Journal -> T.Text -> Maybe Transaction -journalSimilarTransaction cliopts j desc = mbestmatch +journalSimilarTransaction cliopts j desc = + fmap fourth4 $ headMay $ journalTransactionsSimilarTo j desc q 0 1 where - mbestmatch = snd <$> headMay bestmatches - bestmatches = - dbg1With (unlines . ("similar transactions:":) . map (\(score,Transaction{..}) -> printf "%0.3f %s %s" score (show tdate) tdescription)) $ - journalTransactionsSimilarTo j q desc 10 q = queryFromFlags $ _rsReportOpts $ reportspec_ cliopts -- | Render a 'PostingsReport' or 'AccountTransactionsReport' as Text,