diff --git a/hledger-lib/Hledger/Data/Journal.hs b/hledger-lib/Hledger/Data/Journal.hs index 56d9007b0..d68eedcb4 100644 --- a/hledger-lib/Hledger/Data/Journal.hs +++ b/hledger-lib/Hledger/Data/Journal.hs @@ -67,6 +67,7 @@ module Hledger.Data.Journal ( journalNextTransaction, journalPrevTransaction, journalPostings, + journalTransactionsSimilarTo, -- journalPrices, -- * Standard account types journalBalanceSheetAccountQuery, @@ -96,6 +97,7 @@ import "extra" Control.Monad.Extra (whenM) import Control.Monad.Reader as R import Control.Monad.ST (ST, runST) import Data.Array.ST (STArray, getElems, newListArray, writeArray) +import Data.Char (toUpper) import Data.Default (Default(..)) import Data.Function ((&)) import qualified Data.HashTable.Class as H (toList) @@ -125,6 +127,7 @@ import Hledger.Data.Transaction import Hledger.Data.TransactionModifier import Hledger.Data.Posting import Hledger.Query +import Data.List (sortBy) -- try to make Journal ppShow-compatible @@ -321,6 +324,55 @@ journalAccountNames = journalAccountNamesDeclaredOrImplied journalAccountNameTree :: Journal -> Tree AccountName journalAccountNameTree = accountNameTreeFrom . journalAccountNames +-- | Find up to N most similar and most recent transactions matching +-- the given transaction description and query. Transactions are +-- listed with their description's similarity score (see +-- compareDescriptions), sorted by highest score and then by date. +-- Only transactions with a similarity score greater than the minimum +-- threshold (currently 0) are returned. +journalTransactionsSimilarTo :: Journal -> Query -> Text -> Int -> [(Double,Transaction)] +journalTransactionsSimilarTo j q desc n = + take n $ + sortBy (\(s1,t1) (s2,t2) -> compare (s2,tdate t2) (s1,tdate t1)) $ + filter ((> threshold).fst) + [(compareDescriptions desc $ tdescription t, t) | t <- ts] + where + ts = filter (q `matchesTransaction`) $ jtxns j + threshold = 0 + +-- | Return a similarity measure, from 0 to 1, for two transaction +-- descriptions. This is like compareStrings, but first strips out +-- any numbers, to improve accuracy eg when there are bank transaction +-- ids from imported data. +compareDescriptions :: Text -> Text -> Double +compareDescriptions s t = compareStrings s' t' + where s' = simplify $ T.unpack s + t' = simplify $ T.unpack t + simplify = filter (not . (`elem` ("0123456789" :: String))) + +-- | Return a similarity measure, from 0 to 1, for two strings. This +-- was based on Simon White's string similarity algorithm +-- (http://www.catalysoft.com/articles/StrikeAMatch.html), later found +-- to be https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient, +-- modified to handle short strings better. +-- Todo: check out http://nlp.fi.muni.cz/raslan/2008/raslan08.pdf#page=14 . +compareStrings :: String -> String -> Double +compareStrings "" "" = 1 +compareStrings [_] "" = 0 +compareStrings "" [_] = 0 +compareStrings [a] [b] = if toUpper a == toUpper b then 1 else 0 +compareStrings s1 s2 = 2 * commonpairs / totalpairs + where + pairs1 = S.fromList $ wordLetterPairs $ uppercase s1 + pairs2 = S.fromList $ wordLetterPairs $ uppercase s2 + commonpairs = fromIntegral $ S.size $ S.intersection pairs1 pairs2 + totalpairs = fromIntegral $ S.size pairs1 + S.size pairs2 + +wordLetterPairs = concatMap letterPairs . words + +letterPairs (a:b:rest) = [a,b] : letterPairs (b:rest) +letterPairs _ = [] + -- queries for standard account types -- | Get a query for accounts of the specified types in this journal. diff --git a/hledger/Hledger/Cli/Commands/Add.hs b/hledger/Hledger/Cli/Commands/Add.hs index df6f65dc3..2cd37dced 100644 --- a/hledger/Hledger/Cli/Commands/Add.hs +++ b/hledger/Hledger/Cli/Commands/Add.hs @@ -11,7 +11,6 @@ module Hledger.Cli.Commands.Add ( ,add ,appendToJournalFileOrStdout ,journalAddTransaction - ,transactionsSimilarTo ) where @@ -26,7 +25,6 @@ import Data.Char (toUpper, toLower) import Data.Either (isRight) import Data.Functor.Identity (Identity(..)) import "base-compat-batteries" Data.List.Compat -import qualified Data.Set as S import Data.Maybe (fromJust, fromMaybe, isJust) import Data.Text (Text) import qualified Data.Text as T @@ -49,6 +47,7 @@ import Text.Printf import Hledger import Hledger.Cli.CliOptions import Hledger.Cli.Commands.Register (postingsReportAsText) +import Hledger.Cli.Utils (journalSimilarTransaction) addmode = hledgerCommandMode @@ -176,7 +175,7 @@ confirmedTransactionWizard prevInput es@EntryState{..} stack@(currentStage : _) EnterDescAndComment (date, code) -> descriptionAndCommentWizard prevInput es >>= \case Just (desc, comment) -> do - let mbaset = similarTransaction es desc + let mbaset = journalSimilarTransaction esOpts esJournal desc es' = es { esArgs = drop 1 esArgs , esPostings = [] @@ -258,15 +257,6 @@ confirmedTransactionWizard prevInput es@EntryState{..} stack@(currentStage : _) where replaceNthOrAppend n newElem xs = take n xs ++ [newElem] ++ drop (n + 1) xs --- Identify the closest recent match for this description in past transactions. -similarTransaction :: EntryState -> Text -> Maybe Transaction -similarTransaction EntryState{..} desc = - let q = queryFromFlags . rsOpts $ reportspec_ esOpts - historymatches = transactionsSimilarTo esJournal q desc - bestmatch | null historymatches = Nothing - | otherwise = Just $ snd $ head historymatches - in bestmatch - dateAndCodeWizard PrevInput{..} EntryState{..} = do let def = headDef (T.unpack $ showDate esDefDate) esArgs retryMsg "A valid hledger smart date is required. Eg: 2014/2/14, 14, yesterday." $ @@ -482,49 +472,3 @@ registerFromString s = do capitalize :: String -> String capitalize "" = "" capitalize (c:cs) = toUpper c : cs - --- | Find the most similar and recent transactions matching the given --- transaction description and report query. Transactions are listed --- with their "relevancy" score, most relevant first. -transactionsSimilarTo :: Journal -> Query -> Text -> [(Double,Transaction)] -transactionsSimilarTo j q desc = - sortBy compareRelevanceAndRecency - $ filter ((> threshold).fst) - [(compareDescriptions desc $ tdescription t, t) | t <- ts] - where - compareRelevanceAndRecency (n1,t1) (n2,t2) = compare (n2,tdate t2) (n1,tdate t1) - ts = filter (q `matchesTransaction`) $ jtxns j - threshold = 0 - --- | Return a similarity measure, from 0 to 1, for two transaction --- descriptions. This is like compareStrings, but first strips out --- any numbers, to improve accuracy eg when there are bank transaction --- ids from imported data. -compareDescriptions :: Text -> Text -> Double -compareDescriptions s t = compareStrings s' t' - where s' = simplify $ T.unpack s - t' = simplify $ T.unpack t - simplify = filter (not . (`elem` ("0123456789" :: String))) - --- | Return a similarity measure, from 0 to 1, for two strings. This --- was based on Simon White's string similarity algorithm --- (http://www.catalysoft.com/articles/StrikeAMatch.html), later found --- to be https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient, --- modified to handle short strings better. --- Todo: check out http://nlp.fi.muni.cz/raslan/2008/raslan08.pdf#page=14 . -compareStrings :: String -> String -> Double -compareStrings "" "" = 1 -compareStrings [_] "" = 0 -compareStrings "" [_] = 0 -compareStrings [a] [b] = if toUpper a == toUpper b then 1 else 0 -compareStrings s1 s2 = 2 * commonpairs / totalpairs - where - pairs1 = S.fromList $ wordLetterPairs $ uppercase s1 - pairs2 = S.fromList $ wordLetterPairs $ uppercase s2 - commonpairs = fromIntegral $ S.size $ S.intersection pairs1 pairs2 - totalpairs = fromIntegral $ S.size pairs1 + S.size pairs2 - -wordLetterPairs = concatMap letterPairs . words - -letterPairs (a:b:rest) = [a,b] : letterPairs (b:rest) -letterPairs _ = [] diff --git a/hledger/Hledger/Cli/Commands/Print.hs b/hledger/Hledger/Cli/Commands/Print.hs index fea2eaf5d..b55347384 100644 --- a/hledger/Hledger/Cli/Commands/Print.hs +++ b/hledger/Hledger/Cli/Commands/Print.hs @@ -32,7 +32,6 @@ import Hledger.Read.CsvReader (CSV, printCSV) import Hledger import Hledger.Cli.CliOptions import Hledger.Cli.Utils -import Hledger.Cli.Commands.Add ( transactionsSimilarTo ) printmode = hledgerCommandMode @@ -194,16 +193,7 @@ postingToCSV p = -- | Print the transaction most closely and recently matching a description -- (and the query, if any). printMatch :: CliOpts -> Journal -> Text -> IO () -printMatch CliOpts{reportspec_=rspec} j desc = do - case similarTransaction' j (rsQuery rspec) desc of - Nothing -> putStrLn "no matches found." - Just t -> T.putStr $ showTransaction t - - where - -- Identify the closest recent match for this description in past transactions. - similarTransaction' :: Journal -> Query -> Text -> Maybe Transaction - similarTransaction' j q desc - | null historymatches = Nothing - | otherwise = Just $ snd $ head historymatches - where - historymatches = transactionsSimilarTo j q desc +printMatch opts j desc = do + case journalSimilarTransaction opts j desc of + Nothing -> putStrLn "no matches found." + Just t -> T.putStr $ showTransaction t diff --git a/hledger/Hledger/Cli/Utils.hs b/hledger/Hledger/Cli/Utils.hs index b8bbb2e60..4bb092ce5 100644 --- a/hledger/Hledger/Cli/Utils.hs +++ b/hledger/Hledger/Cli/Utils.hs @@ -26,6 +26,7 @@ module Hledger.Cli.Utils pivotByOpts, anonymiseByOpts, utcTimeToClockTime, + journalSimilarTransaction, tests_Cli_Utils, ) where @@ -38,7 +39,7 @@ import qualified Data.Text.IO as T import qualified Data.Text.Lazy as TL import qualified Data.Text.Lazy.IO as TL import Data.Time (UTCTime, Day, addDays) -import Safe (readMay) +import Safe (readMay, headMay) import System.Console.CmdArgs import System.Directory (getModificationTime, getDirectoryContents, copyFile, doesFileExist) import System.Exit @@ -302,6 +303,17 @@ backupNumber f g = case g =~ ("^" ++ f ++ "\\.([0-9]+)$") of (_::FilePath, _::FilePath, _::FilePath, [ext::FilePath]) -> readMay ext _ -> Nothing +-- Identify the closest recent match for this description in past transactions. +-- If the options specify a query, only matched transactions are considered. +journalSimilarTransaction :: CliOpts -> Journal -> T.Text -> Maybe Transaction +journalSimilarTransaction cliopts j desc = mbestmatch + where + mbestmatch = snd <$> headMay bestmatches + bestmatches = + dbg1With (unlines . ("similar transactions:":) . map (\(score,Transaction{..}) -> printf "%0.3f %s %s" score (show tdate) tdescription)) $ + journalTransactionsSimilarTo j q desc 10 + q = queryFromFlags $ rsOpts $ reportspec_ cliopts + tests_Cli_Utils = tests "Utils" [ -- tests "journalApplyValue" [