add, print --match: prioritise infix matches

If the search description occurs in full within the other description,
that match gets a +0.5 score boost.
This commit is contained in:
Simon Michael 2021-02-20 13:42:40 -08:00
parent 44508f58c8
commit 5e7c4fc7bc

View File

@ -97,7 +97,7 @@ import "extra" Control.Monad.Extra (whenM)
import Control.Monad.Reader as R import Control.Monad.Reader as R
import Control.Monad.ST (ST, runST) import Control.Monad.ST (ST, runST)
import Data.Array.ST (STArray, getElems, newListArray, writeArray) import Data.Array.ST (STArray, getElems, newListArray, writeArray)
import Data.Char (toUpper) import Data.Char (toUpper, isDigit)
import Data.Default (Default(..)) import Data.Default (Default(..))
import Data.Function ((&)) import Data.Function ((&))
import qualified Data.HashTable.Class as H (toList) import qualified Data.HashTable.Class as H (toList)
@ -328,33 +328,37 @@ journalAccountNameTree = accountNameTreeFrom . journalAccountNames
-- the given transaction description and query. Transactions are -- the given transaction description and query. Transactions are
-- listed with their description's similarity score (see -- listed with their description's similarity score (see
-- compareDescriptions), sorted by highest score and then by date. -- compareDescriptions), sorted by highest score and then by date.
-- Only transactions with a similarity score greater than the minimum -- Only transactions with a similarity score greater than a minimum
-- threshold (currently 0) are returned. -- threshold (currently 0) are returned.
journalTransactionsSimilarTo :: Journal -> Query -> Text -> Int -> [(Double,Transaction)] journalTransactionsSimilarTo :: Journal -> Query -> Text -> Int -> [(Double,Transaction)]
journalTransactionsSimilarTo j q desc n = journalTransactionsSimilarTo Journal{jtxns} q desc n =
take n $ take n $
sortBy (\(s1,t1) (s2,t2) -> compare (s2,tdate t2) (s1,tdate t1)) $ sortBy (\(s1,t1) (s2,t2) -> compare (s2,tdate t2) (s1,tdate t1)) $
filter ((> threshold).fst) filter ((> threshold).fst)
[(compareDescriptions desc $ tdescription t, t) | t <- ts] [(compareDescriptions desc $ tdescription t, t) | t <- jtxns, q `matchesTransaction` t]
where where
ts = filter (q `matchesTransaction`) $ jtxns j
threshold = 0 threshold = 0
-- | Return a similarity measure, from 0 to 1, for two transaction -- | Return a similarity score from 0 to 1.5 for two transaction descriptions.
-- descriptions. This is like compareStrings, but first strips out -- This is based on compareStrings, with the following modifications:
-- any numbers, to improve accuracy eg when there are bank transaction --
-- ids from imported data. -- - numbers are stripped out before measuring similarity
--
-- - if the (unstripped) first description appears in its entirety within the second,
-- the score is boosted by 0.5.
--
compareDescriptions :: Text -> Text -> Double compareDescriptions :: Text -> Text -> Double
compareDescriptions s t = compareStrings s' t' compareDescriptions a b =
where s' = simplify $ T.unpack s (if a `T.isInfixOf` b then (0.5+) else id) $
t' = simplify $ T.unpack t compareStrings (simplify a) (simplify b)
simplify = filter (not . (`elem` ("0123456789" :: String))) where
simplify = T.unpack . T.filter (not.isDigit)
-- | Return a similarity measure, from 0 to 1, for two strings. This -- | Return a similarity score from 0 to 1 for two strings. This
-- was based on Simon White's string similarity algorithm -- was based on Simon White's string similarity algorithm
-- (http://www.catalysoft.com/articles/StrikeAMatch.html), later found -- (http://www.catalysoft.com/articles/StrikeAMatch.html), later found
-- to be https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient, -- to be https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient,
-- modified to handle short strings better. -- and modified to handle short strings better.
-- Todo: check out http://nlp.fi.muni.cz/raslan/2008/raslan08.pdf#page=14 . -- Todo: check out http://nlp.fi.muni.cz/raslan/2008/raslan08.pdf#page=14 .
compareStrings :: String -> String -> Double compareStrings :: String -> String -> Double
compareStrings "" "" = 1 compareStrings "" "" = 1
@ -362,14 +366,16 @@ compareStrings [_] "" = 0
compareStrings "" [_] = 0 compareStrings "" [_] = 0
compareStrings [a] [b] = if toUpper a == toUpper b then 1 else 0 compareStrings [a] [b] = if toUpper a == toUpper b then 1 else 0
compareStrings s1 s2 = 2 * commonpairs / totalpairs compareStrings s1 s2 = 2 * commonpairs / totalpairs
where where
pairs1 = S.fromList $ wordLetterPairs $ uppercase s1 pairs1 = S.fromList $ wordLetterPairs $ uppercase s1
pairs2 = S.fromList $ wordLetterPairs $ uppercase s2 pairs2 = S.fromList $ wordLetterPairs $ uppercase s2
commonpairs = fromIntegral $ S.size $ S.intersection pairs1 pairs2 commonpairs = fromIntegral $ S.size $ S.intersection pairs1 pairs2
totalpairs = fromIntegral $ S.size pairs1 + S.size pairs2 totalpairs = fromIntegral $ S.size pairs1 + S.size pairs2
wordLetterPairs :: String -> [String]
wordLetterPairs = concatMap letterPairs . words wordLetterPairs = concatMap letterPairs . words
letterPairs :: String -> [String]
letterPairs (a:b:rest) = [a,b] : letterPairs (b:rest) letterPairs (a:b:rest) = [a,b] : letterPairs (b:rest)
letterPairs _ = [] letterPairs _ = []