imp: print: --match makes better choices

Previously, similarity completely outweighed recency, so a
slightly-more-similar transaction would always be selected no matter
how old it was. Now similarity and recency are more balanced,
and it should produce the desired transaction more often.
There is also new debug output (at debug level 1) for
troubleshooting.
This commit is contained in:
Simon Michael 2023-03-27 15:13:49 -10:00
parent 6bf2afe80c
commit c03d6b1123
2 changed files with 36 additions and 17 deletions

View File

@ -3,6 +3,7 @@
{-# LANGUAGE Rank2Types #-} {-# LANGUAGE Rank2Types #-}
{-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TupleSections #-} {-# LANGUAGE TupleSections #-}
{-# LANGUAGE RecordWildCards #-}
{-| {-|
@ -122,8 +123,8 @@ import Data.Maybe (catMaybes, fromMaybe, mapMaybe, maybeToList)
import qualified Data.Set as S import qualified Data.Set as S
import Data.Text (Text) import Data.Text (Text)
import qualified Data.Text as T import qualified Data.Text as T
import Safe (headMay, headDef, maximumMay, minimumMay) import Safe (headMay, headDef, maximumMay, minimumMay, lastDef)
import Data.Time.Calendar (Day, addDays, fromGregorian) import Data.Time.Calendar (Day, addDays, fromGregorian, diffDays)
import Data.Time.Clock.POSIX (POSIXTime) import Data.Time.Clock.POSIX (POSIXTime)
import Data.Tree (Tree(..), flatten) import Data.Tree (Tree(..), flatten)
import Text.Printf (printf) import Text.Printf (printf)
@ -140,6 +141,9 @@ import Hledger.Data.TransactionModifier
import Hledger.Data.Valuation import Hledger.Data.Valuation
import Hledger.Query import Hledger.Query
import System.FilePath (takeFileName) import System.FilePath (takeFileName)
import Data.Ord (comparing)
import Hledger.Data.Dates (nulldate)
import Data.List (sort)
-- | A parser of text that runs in some monad, keeping a Journal as state. -- | A parser of text that runs in some monad, keeping a Journal as state.
@ -429,20 +433,38 @@ journalInheritedAccountTags j a =
as = a : parentAccountNames a as = a : parentAccountNames a
-- PERF: cache in journal ? -- PERF: cache in journal ?
type DateWeightedSimilarityScore = Double
type SimilarityScore = Double
type Age = Integer
-- | Find up to N most similar and most recent transactions matching -- | Find up to N most similar and most recent transactions matching
-- the given transaction description and query. Transactions are -- the given transaction description and query and exceeding the given
-- listed with their description's similarity score (see -- description similarity score (0 to 1, see compareDescriptions).
-- compareDescriptions), sorted by highest score and then by date. -- Returns transactions along with
-- Only transactions with a similarity score greater than a minimum -- their age in days compared to the latest transaction date,
-- threshold (currently 0) are returned. -- their description similarity score,
journalTransactionsSimilarTo :: Journal -> Query -> Text -> Int -> [(Double,Transaction)] -- and a heuristically date-weighted variant of this that favours more recent transactions.
journalTransactionsSimilarTo Journal{jtxns} q desc n = journalTransactionsSimilarTo :: Journal -> Text -> Query -> SimilarityScore -> Int
-> [(DateWeightedSimilarityScore, Age, SimilarityScore, Transaction)]
journalTransactionsSimilarTo Journal{jtxns} desc q similaritythreshold n =
take n $ take n $
sortBy (\(s1,t1) (s2,t2) -> compare (s2,tdate t2) (s1,tdate t1)) $ dbg1With (
filter ((> threshold).fst) unlines .
("up to 30 transactions above description similarity threshold "<>show similaritythreshold<>" ordered by recency-weighted similarity:":) .
take 30 .
map ( \(w,a,s,Transaction{..}) -> printf "weighted:%8.3f age:%4d similarity:%5.3f %s %s" w a s (show tdate) tdescription )) $
sortBy (comparing (negate.first4)) $
map (\(s,t) -> (weightedScore (s,t), age t, s, t)) $
filter ((> similaritythreshold).fst)
[(compareDescriptions desc $ tdescription t, t) | t <- jtxns, q `matchesTransaction` t] [(compareDescriptions desc $ tdescription t, t) | t <- jtxns, q `matchesTransaction` t]
where where
threshold = 0 latest = lastDef nulldate $ sort $ map tdate jtxns
age = diffDays latest . tdate
-- Combine similarity and recency heuristically. This gave decent results
-- in my "find most recent invoice" use case in 2023-03,
-- but will probably need more attention.
weightedScore :: (Double, Transaction) -> Double
weightedScore (s, t) = 100 * s - fromIntegral (age t) / 4
-- | Return a similarity score from 0 to 1.5 for two transaction descriptions. -- | Return a similarity score from 0 to 1.5 for two transaction descriptions.
-- This is based on compareStrings, with the following modifications: -- This is based on compareStrings, with the following modifications:

View File

@ -251,12 +251,9 @@ backupNumber f g = case g =~ ("^" ++ f ++ "\\.([0-9]+)$") of
-- Identify the closest recent match for this description in past transactions. -- Identify the closest recent match for this description in past transactions.
-- If the options specify a query, only matched transactions are considered. -- If the options specify a query, only matched transactions are considered.
journalSimilarTransaction :: CliOpts -> Journal -> T.Text -> Maybe Transaction journalSimilarTransaction :: CliOpts -> Journal -> T.Text -> Maybe Transaction
journalSimilarTransaction cliopts j desc = mbestmatch journalSimilarTransaction cliopts j desc =
fmap fourth4 $ headMay $ journalTransactionsSimilarTo j desc q 0 1
where where
mbestmatch = snd <$> headMay bestmatches
bestmatches =
dbg1With (unlines . ("similar transactions:":) . map (\(score,Transaction{..}) -> printf "%0.3f %s %s" score (show tdate) tdescription)) $
journalTransactionsSimilarTo j q desc 10
q = queryFromFlags $ _rsReportOpts $ reportspec_ cliopts q = queryFromFlags $ _rsReportOpts $ reportspec_ cliopts
-- | Render a 'PostingsReport' or 'AccountTransactionsReport' as Text, -- | Render a 'PostingsReport' or 'AccountTransactionsReport' as Text,