add, print --match: prioritise infix matches
If the search description occurs in full within the other description, that match gets a +0.5 score boost.
This commit is contained in:
		
							parent
							
								
									44508f58c8
								
							
						
					
					
						commit
						5e7c4fc7bc
					
				| @ -97,7 +97,7 @@ import "extra" Control.Monad.Extra (whenM) | |||||||
| import Control.Monad.Reader as R | import Control.Monad.Reader as R | ||||||
| import Control.Monad.ST (ST, runST) | import Control.Monad.ST (ST, runST) | ||||||
| import Data.Array.ST (STArray, getElems, newListArray, writeArray) | import Data.Array.ST (STArray, getElems, newListArray, writeArray) | ||||||
| import Data.Char (toUpper) | import Data.Char (toUpper, isDigit) | ||||||
| import Data.Default (Default(..)) | import Data.Default (Default(..)) | ||||||
| import Data.Function ((&)) | import Data.Function ((&)) | ||||||
| import qualified Data.HashTable.Class as H (toList) | import qualified Data.HashTable.Class as H (toList) | ||||||
| @ -328,33 +328,37 @@ journalAccountNameTree = accountNameTreeFrom . journalAccountNames | |||||||
| -- the given transaction description and query. Transactions are | -- the given transaction description and query. Transactions are | ||||||
| -- listed with their description's similarity score (see | -- listed with their description's similarity score (see | ||||||
| -- compareDescriptions), sorted by highest score and then by date. | -- compareDescriptions), sorted by highest score and then by date. | ||||||
| -- Only transactions with a similarity score greater than the minimum | -- Only transactions with a similarity score greater than a minimum | ||||||
| -- threshold (currently 0) are returned. | -- threshold (currently 0) are returned. | ||||||
| journalTransactionsSimilarTo :: Journal -> Query -> Text -> Int -> [(Double,Transaction)] | journalTransactionsSimilarTo :: Journal -> Query -> Text -> Int -> [(Double,Transaction)] | ||||||
| journalTransactionsSimilarTo j q desc n = | journalTransactionsSimilarTo Journal{jtxns} q desc n = | ||||||
|   take n $ |   take n $ | ||||||
|   sortBy (\(s1,t1) (s2,t2) -> compare (s2,tdate t2) (s1,tdate t1)) $ |   sortBy (\(s1,t1) (s2,t2) -> compare (s2,tdate t2) (s1,tdate t1)) $ | ||||||
|   filter ((> threshold).fst) |   filter ((> threshold).fst) | ||||||
|   [(compareDescriptions desc $ tdescription t, t) | t <- ts] |   [(compareDescriptions desc $ tdescription t, t) | t <- jtxns, q `matchesTransaction` t] | ||||||
|   where |   where | ||||||
|     ts = filter (q `matchesTransaction`) $ jtxns j |  | ||||||
|     threshold = 0 |     threshold = 0 | ||||||
| 
 | 
 | ||||||
| -- | Return a similarity measure, from 0 to 1, for two transaction | -- | Return a similarity score from 0 to 1.5 for two transaction descriptions.  | ||||||
| -- descriptions.  This is like compareStrings, but first strips out | -- This is based on compareStrings, with the following modifications: | ||||||
| -- any numbers, to improve accuracy eg when there are bank transaction | -- | ||||||
| -- ids from imported data. | -- - numbers are stripped out before measuring similarity | ||||||
|  | -- | ||||||
|  | -- - if the (unstripped) first description appears in its entirety within the second, | ||||||
|  | --   the score is boosted by 0.5. | ||||||
|  | -- | ||||||
| compareDescriptions :: Text -> Text -> Double | compareDescriptions :: Text -> Text -> Double | ||||||
| compareDescriptions s t = compareStrings s' t' | compareDescriptions a b = | ||||||
|     where s' = simplify $ T.unpack s |   (if a `T.isInfixOf` b then (0.5+) else id) $ | ||||||
|           t' = simplify $ T.unpack t |   compareStrings (simplify a) (simplify b) | ||||||
|           simplify = filter (not . (`elem` ("0123456789" :: String))) |   where | ||||||
|  |     simplify = T.unpack . T.filter (not.isDigit) | ||||||
| 
 | 
 | ||||||
| -- | Return a similarity measure, from 0 to 1, for two strings.  This | -- | Return a similarity score from 0 to 1 for two strings.  This | ||||||
| -- was based on Simon White's string similarity algorithm | -- was based on Simon White's string similarity algorithm | ||||||
| -- (http://www.catalysoft.com/articles/StrikeAMatch.html), later found | -- (http://www.catalysoft.com/articles/StrikeAMatch.html), later found | ||||||
| -- to be https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient, | -- to be https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient, | ||||||
| -- modified to handle short strings better. | -- and modified to handle short strings better. | ||||||
| -- Todo: check out http://nlp.fi.muni.cz/raslan/2008/raslan08.pdf#page=14 . | -- Todo: check out http://nlp.fi.muni.cz/raslan/2008/raslan08.pdf#page=14 . | ||||||
| compareStrings :: String -> String -> Double | compareStrings :: String -> String -> Double | ||||||
| compareStrings "" "" = 1 | compareStrings "" "" = 1 | ||||||
| @ -362,14 +366,16 @@ compareStrings [_] "" = 0 | |||||||
| compareStrings "" [_] = 0 | compareStrings "" [_] = 0 | ||||||
| compareStrings [a] [b] = if toUpper a == toUpper b then 1 else 0 | compareStrings [a] [b] = if toUpper a == toUpper b then 1 else 0 | ||||||
| compareStrings s1 s2 = 2 * commonpairs / totalpairs | compareStrings s1 s2 = 2 * commonpairs / totalpairs | ||||||
|     where |   where | ||||||
|       pairs1      = S.fromList $ wordLetterPairs $ uppercase s1 |     pairs1      = S.fromList $ wordLetterPairs $ uppercase s1 | ||||||
|       pairs2      = S.fromList $ wordLetterPairs $ uppercase s2 |     pairs2      = S.fromList $ wordLetterPairs $ uppercase s2 | ||||||
|       commonpairs = fromIntegral $ S.size $ S.intersection pairs1 pairs2 |     commonpairs = fromIntegral $ S.size $ S.intersection pairs1 pairs2 | ||||||
|       totalpairs  = fromIntegral $ S.size pairs1 + S.size pairs2 |     totalpairs  = fromIntegral $ S.size pairs1 + S.size pairs2 | ||||||
| 
 | 
 | ||||||
|  | wordLetterPairs :: String -> [String] | ||||||
| wordLetterPairs = concatMap letterPairs . words | wordLetterPairs = concatMap letterPairs . words | ||||||
| 
 | 
 | ||||||
|  | letterPairs :: String -> [String] | ||||||
| letterPairs (a:b:rest) = [a,b] : letterPairs (b:rest) | letterPairs (a:b:rest) = [a,b] : letterPairs (b:rest) | ||||||
| letterPairs _ = [] | letterPairs _ = [] | ||||||
| 
 | 
 | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user