add, print --match: prioritise infix matches

If the search description occurs in full within the other description, that match gets a +0.5 score boost.
2021-02-20 13:42:40 -08:00 · 2021-02-20 13:42:40 -08:00 · 5e7c4fc7bc
commit 5e7c4fc7bc
parent 44508f58c8
1 changed files with 26 additions and 20 deletions
--- a/hledger-lib/Hledger/Data/Journal.hs
+++ b/hledger-lib/Hledger/Data/Journal.hs
@ -97,7 +97,7 @@ import "extra" Control.Monad.Extra (whenM)
 import Control.Monad.Reader as R
 import Control.Monad.ST (ST, runST)
 import Data.Array.ST (STArray, getElems, newListArray, writeArray)
-import Data.Char (toUpper)
+import Data.Char (toUpper, isDigit)
 import Data.Default (Default(..))
 import Data.Function ((&))
 import qualified Data.HashTable.Class as H (toList)
@ -328,33 +328,37 @@ journalAccountNameTree = accountNameTreeFrom . journalAccountNames
 -- the given transaction description and query. Transactions are
 -- listed with their description's similarity score (see
 -- compareDescriptions), sorted by highest score and then by date.
-- Only transactions with a similarity score greater than the minimum
+-- Only transactions with a similarity score greater than a minimum
 -- threshold (currently 0) are returned.
 journalTransactionsSimilarTo :: Journal -> Query -> Text -> Int -> [(Double,Transaction)]
-journalTransactionsSimilarTo j q desc n =
+journalTransactionsSimilarTo Journal{jtxns} q desc n =
  take n $
  sortBy (\(s1,t1) (s2,t2) -> compare (s2,tdate t2) (s1,tdate t1)) $
  filter ((> threshold).fst)
-  [(compareDescriptions desc $ tdescription t, t) | t <- ts]
+  [(compareDescriptions desc $ tdescription t, t) | t <- jtxns, q `matchesTransaction` t]
  where
-    ts = filter (q `matchesTransaction`) $ jtxns j
    threshold = 0

-- | Return a similarity measure, from 0 to 1, for two transaction
-- descriptions.  This is like compareStrings, but first strips out
-- any numbers, to improve accuracy eg when there are bank transaction
-- ids from imported data.
+-- | Return a similarity score from 0 to 1.5 for two transaction descriptions. 
+-- This is based on compareStrings, with the following modifications:
+--
+-- - numbers are stripped out before measuring similarity
+--
+-- - if the (unstripped) first description appears in its entirety within the second,
+--   the score is boosted by 0.5.
+--
 compareDescriptions :: Text -> Text -> Double
-compareDescriptions s t = compareStrings s' t'
-    where s' = simplify $ T.unpack s
-          t' = simplify $ T.unpack t
-          simplify = filter (not . (`elem` ("0123456789" :: String)))
+compareDescriptions a b =
+  (if a `T.isInfixOf` b then (0.5+) else id) $
+  compareStrings (simplify a) (simplify b)
+  where
+    simplify = T.unpack . T.filter (not.isDigit)

-- | Return a similarity measure, from 0 to 1, for two strings.  This
+-- | Return a similarity score from 0 to 1 for two strings.  This
 -- was based on Simon White's string similarity algorithm
 -- (http://www.catalysoft.com/articles/StrikeAMatch.html), later found
 -- to be https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient,
-- modified to handle short strings better.
+-- and modified to handle short strings better.
 -- Todo: check out http://nlp.fi.muni.cz/raslan/2008/raslan08.pdf#page=14 .
 compareStrings :: String -> String -> Double
 compareStrings "" "" = 1
@ -368,8 +372,10 @@ compareStrings s1 s2 = 2 * commonpairs / totalpairs
    commonpairs = fromIntegral $ S.size $ S.intersection pairs1 pairs2
    totalpairs  = fromIntegral $ S.size pairs1 + S.size pairs2

+wordLetterPairs :: String -> [String]
 wordLetterPairs = concatMap letterPairs . words

+letterPairs :: String -> [String]
 letterPairs (a:b:rest) = [a,b] : letterPairs (b:rest)
 letterPairs _ = []