add, lib: debug output, refactor similar transactions lookup
add --debug=1 shows the top hits for similar past transactions. added: Hledger.Cli.Utils.journalSimilarTransaction provides --debug=1 output changed: Hledger.Cli.Commands.Add.transactionsSimilarTo -> Hledger.Data.Journal.journalTransactionsSimilarTo now takes an extra number-of-results argument
This commit is contained in:
parent
7979c7d74c
commit
44508f58c8
@ -67,6 +67,7 @@ module Hledger.Data.Journal (
|
|||||||
journalNextTransaction,
|
journalNextTransaction,
|
||||||
journalPrevTransaction,
|
journalPrevTransaction,
|
||||||
journalPostings,
|
journalPostings,
|
||||||
|
journalTransactionsSimilarTo,
|
||||||
-- journalPrices,
|
-- journalPrices,
|
||||||
-- * Standard account types
|
-- * Standard account types
|
||||||
journalBalanceSheetAccountQuery,
|
journalBalanceSheetAccountQuery,
|
||||||
@ -96,6 +97,7 @@ import "extra" Control.Monad.Extra (whenM)
|
|||||||
import Control.Monad.Reader as R
|
import Control.Monad.Reader as R
|
||||||
import Control.Monad.ST (ST, runST)
|
import Control.Monad.ST (ST, runST)
|
||||||
import Data.Array.ST (STArray, getElems, newListArray, writeArray)
|
import Data.Array.ST (STArray, getElems, newListArray, writeArray)
|
||||||
|
import Data.Char (toUpper)
|
||||||
import Data.Default (Default(..))
|
import Data.Default (Default(..))
|
||||||
import Data.Function ((&))
|
import Data.Function ((&))
|
||||||
import qualified Data.HashTable.Class as H (toList)
|
import qualified Data.HashTable.Class as H (toList)
|
||||||
@ -125,6 +127,7 @@ import Hledger.Data.Transaction
|
|||||||
import Hledger.Data.TransactionModifier
|
import Hledger.Data.TransactionModifier
|
||||||
import Hledger.Data.Posting
|
import Hledger.Data.Posting
|
||||||
import Hledger.Query
|
import Hledger.Query
|
||||||
|
import Data.List (sortBy)
|
||||||
|
|
||||||
|
|
||||||
-- try to make Journal ppShow-compatible
|
-- try to make Journal ppShow-compatible
|
||||||
@ -321,6 +324,55 @@ journalAccountNames = journalAccountNamesDeclaredOrImplied
|
|||||||
journalAccountNameTree :: Journal -> Tree AccountName
|
journalAccountNameTree :: Journal -> Tree AccountName
|
||||||
journalAccountNameTree = accountNameTreeFrom . journalAccountNames
|
journalAccountNameTree = accountNameTreeFrom . journalAccountNames
|
||||||
|
|
||||||
|
-- | Find up to N most similar and most recent transactions matching
|
||||||
|
-- the given transaction description and query. Transactions are
|
||||||
|
-- listed with their description's similarity score (see
|
||||||
|
-- compareDescriptions), sorted by highest score and then by date.
|
||||||
|
-- Only transactions with a similarity score greater than the minimum
|
||||||
|
-- threshold (currently 0) are returned.
|
||||||
|
journalTransactionsSimilarTo :: Journal -> Query -> Text -> Int -> [(Double,Transaction)]
|
||||||
|
journalTransactionsSimilarTo j q desc n =
|
||||||
|
take n $
|
||||||
|
sortBy (\(s1,t1) (s2,t2) -> compare (s2,tdate t2) (s1,tdate t1)) $
|
||||||
|
filter ((> threshold).fst)
|
||||||
|
[(compareDescriptions desc $ tdescription t, t) | t <- ts]
|
||||||
|
where
|
||||||
|
ts = filter (q `matchesTransaction`) $ jtxns j
|
||||||
|
threshold = 0
|
||||||
|
|
||||||
|
-- | Return a similarity measure, from 0 to 1, for two transaction
|
||||||
|
-- descriptions. This is like compareStrings, but first strips out
|
||||||
|
-- any numbers, to improve accuracy eg when there are bank transaction
|
||||||
|
-- ids from imported data.
|
||||||
|
compareDescriptions :: Text -> Text -> Double
|
||||||
|
compareDescriptions s t = compareStrings s' t'
|
||||||
|
where s' = simplify $ T.unpack s
|
||||||
|
t' = simplify $ T.unpack t
|
||||||
|
simplify = filter (not . (`elem` ("0123456789" :: String)))
|
||||||
|
|
||||||
|
-- | Return a similarity measure, from 0 to 1, for two strings. This
|
||||||
|
-- was based on Simon White's string similarity algorithm
|
||||||
|
-- (http://www.catalysoft.com/articles/StrikeAMatch.html), later found
|
||||||
|
-- to be https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient,
|
||||||
|
-- modified to handle short strings better.
|
||||||
|
-- Todo: check out http://nlp.fi.muni.cz/raslan/2008/raslan08.pdf#page=14 .
|
||||||
|
compareStrings :: String -> String -> Double
|
||||||
|
compareStrings "" "" = 1
|
||||||
|
compareStrings [_] "" = 0
|
||||||
|
compareStrings "" [_] = 0
|
||||||
|
compareStrings [a] [b] = if toUpper a == toUpper b then 1 else 0
|
||||||
|
compareStrings s1 s2 = 2 * commonpairs / totalpairs
|
||||||
|
where
|
||||||
|
pairs1 = S.fromList $ wordLetterPairs $ uppercase s1
|
||||||
|
pairs2 = S.fromList $ wordLetterPairs $ uppercase s2
|
||||||
|
commonpairs = fromIntegral $ S.size $ S.intersection pairs1 pairs2
|
||||||
|
totalpairs = fromIntegral $ S.size pairs1 + S.size pairs2
|
||||||
|
|
||||||
|
wordLetterPairs = concatMap letterPairs . words
|
||||||
|
|
||||||
|
letterPairs (a:b:rest) = [a,b] : letterPairs (b:rest)
|
||||||
|
letterPairs _ = []
|
||||||
|
|
||||||
-- queries for standard account types
|
-- queries for standard account types
|
||||||
|
|
||||||
-- | Get a query for accounts of the specified types in this journal.
|
-- | Get a query for accounts of the specified types in this journal.
|
||||||
|
|||||||
@ -11,7 +11,6 @@ module Hledger.Cli.Commands.Add (
|
|||||||
,add
|
,add
|
||||||
,appendToJournalFileOrStdout
|
,appendToJournalFileOrStdout
|
||||||
,journalAddTransaction
|
,journalAddTransaction
|
||||||
,transactionsSimilarTo
|
|
||||||
)
|
)
|
||||||
where
|
where
|
||||||
|
|
||||||
@ -26,7 +25,6 @@ import Data.Char (toUpper, toLower)
|
|||||||
import Data.Either (isRight)
|
import Data.Either (isRight)
|
||||||
import Data.Functor.Identity (Identity(..))
|
import Data.Functor.Identity (Identity(..))
|
||||||
import "base-compat-batteries" Data.List.Compat
|
import "base-compat-batteries" Data.List.Compat
|
||||||
import qualified Data.Set as S
|
|
||||||
import Data.Maybe (fromJust, fromMaybe, isJust)
|
import Data.Maybe (fromJust, fromMaybe, isJust)
|
||||||
import Data.Text (Text)
|
import Data.Text (Text)
|
||||||
import qualified Data.Text as T
|
import qualified Data.Text as T
|
||||||
@ -49,6 +47,7 @@ import Text.Printf
|
|||||||
import Hledger
|
import Hledger
|
||||||
import Hledger.Cli.CliOptions
|
import Hledger.Cli.CliOptions
|
||||||
import Hledger.Cli.Commands.Register (postingsReportAsText)
|
import Hledger.Cli.Commands.Register (postingsReportAsText)
|
||||||
|
import Hledger.Cli.Utils (journalSimilarTransaction)
|
||||||
|
|
||||||
|
|
||||||
addmode = hledgerCommandMode
|
addmode = hledgerCommandMode
|
||||||
@ -176,7 +175,7 @@ confirmedTransactionWizard prevInput es@EntryState{..} stack@(currentStage : _)
|
|||||||
|
|
||||||
EnterDescAndComment (date, code) -> descriptionAndCommentWizard prevInput es >>= \case
|
EnterDescAndComment (date, code) -> descriptionAndCommentWizard prevInput es >>= \case
|
||||||
Just (desc, comment) -> do
|
Just (desc, comment) -> do
|
||||||
let mbaset = similarTransaction es desc
|
let mbaset = journalSimilarTransaction esOpts esJournal desc
|
||||||
es' = es
|
es' = es
|
||||||
{ esArgs = drop 1 esArgs
|
{ esArgs = drop 1 esArgs
|
||||||
, esPostings = []
|
, esPostings = []
|
||||||
@ -258,15 +257,6 @@ confirmedTransactionWizard prevInput es@EntryState{..} stack@(currentStage : _)
|
|||||||
where
|
where
|
||||||
replaceNthOrAppend n newElem xs = take n xs ++ [newElem] ++ drop (n + 1) xs
|
replaceNthOrAppend n newElem xs = take n xs ++ [newElem] ++ drop (n + 1) xs
|
||||||
|
|
||||||
-- Identify the closest recent match for this description in past transactions.
|
|
||||||
similarTransaction :: EntryState -> Text -> Maybe Transaction
|
|
||||||
similarTransaction EntryState{..} desc =
|
|
||||||
let q = queryFromFlags . rsOpts $ reportspec_ esOpts
|
|
||||||
historymatches = transactionsSimilarTo esJournal q desc
|
|
||||||
bestmatch | null historymatches = Nothing
|
|
||||||
| otherwise = Just $ snd $ head historymatches
|
|
||||||
in bestmatch
|
|
||||||
|
|
||||||
dateAndCodeWizard PrevInput{..} EntryState{..} = do
|
dateAndCodeWizard PrevInput{..} EntryState{..} = do
|
||||||
let def = headDef (T.unpack $ showDate esDefDate) esArgs
|
let def = headDef (T.unpack $ showDate esDefDate) esArgs
|
||||||
retryMsg "A valid hledger smart date is required. Eg: 2014/2/14, 14, yesterday." $
|
retryMsg "A valid hledger smart date is required. Eg: 2014/2/14, 14, yesterday." $
|
||||||
@ -482,49 +472,3 @@ registerFromString s = do
|
|||||||
capitalize :: String -> String
|
capitalize :: String -> String
|
||||||
capitalize "" = ""
|
capitalize "" = ""
|
||||||
capitalize (c:cs) = toUpper c : cs
|
capitalize (c:cs) = toUpper c : cs
|
||||||
|
|
||||||
-- | Find the most similar and recent transactions matching the given
|
|
||||||
-- transaction description and report query. Transactions are listed
|
|
||||||
-- with their "relevancy" score, most relevant first.
|
|
||||||
transactionsSimilarTo :: Journal -> Query -> Text -> [(Double,Transaction)]
|
|
||||||
transactionsSimilarTo j q desc =
|
|
||||||
sortBy compareRelevanceAndRecency
|
|
||||||
$ filter ((> threshold).fst)
|
|
||||||
[(compareDescriptions desc $ tdescription t, t) | t <- ts]
|
|
||||||
where
|
|
||||||
compareRelevanceAndRecency (n1,t1) (n2,t2) = compare (n2,tdate t2) (n1,tdate t1)
|
|
||||||
ts = filter (q `matchesTransaction`) $ jtxns j
|
|
||||||
threshold = 0
|
|
||||||
|
|
||||||
-- | Return a similarity measure, from 0 to 1, for two transaction
|
|
||||||
-- descriptions. This is like compareStrings, but first strips out
|
|
||||||
-- any numbers, to improve accuracy eg when there are bank transaction
|
|
||||||
-- ids from imported data.
|
|
||||||
compareDescriptions :: Text -> Text -> Double
|
|
||||||
compareDescriptions s t = compareStrings s' t'
|
|
||||||
where s' = simplify $ T.unpack s
|
|
||||||
t' = simplify $ T.unpack t
|
|
||||||
simplify = filter (not . (`elem` ("0123456789" :: String)))
|
|
||||||
|
|
||||||
-- | Return a similarity measure, from 0 to 1, for two strings. This
|
|
||||||
-- was based on Simon White's string similarity algorithm
|
|
||||||
-- (http://www.catalysoft.com/articles/StrikeAMatch.html), later found
|
|
||||||
-- to be https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient,
|
|
||||||
-- modified to handle short strings better.
|
|
||||||
-- Todo: check out http://nlp.fi.muni.cz/raslan/2008/raslan08.pdf#page=14 .
|
|
||||||
compareStrings :: String -> String -> Double
|
|
||||||
compareStrings "" "" = 1
|
|
||||||
compareStrings [_] "" = 0
|
|
||||||
compareStrings "" [_] = 0
|
|
||||||
compareStrings [a] [b] = if toUpper a == toUpper b then 1 else 0
|
|
||||||
compareStrings s1 s2 = 2 * commonpairs / totalpairs
|
|
||||||
where
|
|
||||||
pairs1 = S.fromList $ wordLetterPairs $ uppercase s1
|
|
||||||
pairs2 = S.fromList $ wordLetterPairs $ uppercase s2
|
|
||||||
commonpairs = fromIntegral $ S.size $ S.intersection pairs1 pairs2
|
|
||||||
totalpairs = fromIntegral $ S.size pairs1 + S.size pairs2
|
|
||||||
|
|
||||||
wordLetterPairs = concatMap letterPairs . words
|
|
||||||
|
|
||||||
letterPairs (a:b:rest) = [a,b] : letterPairs (b:rest)
|
|
||||||
letterPairs _ = []
|
|
||||||
|
|||||||
@ -32,7 +32,6 @@ import Hledger.Read.CsvReader (CSV, printCSV)
|
|||||||
import Hledger
|
import Hledger
|
||||||
import Hledger.Cli.CliOptions
|
import Hledger.Cli.CliOptions
|
||||||
import Hledger.Cli.Utils
|
import Hledger.Cli.Utils
|
||||||
import Hledger.Cli.Commands.Add ( transactionsSimilarTo )
|
|
||||||
|
|
||||||
|
|
||||||
printmode = hledgerCommandMode
|
printmode = hledgerCommandMode
|
||||||
@ -194,16 +193,7 @@ postingToCSV p =
|
|||||||
-- | Print the transaction most closely and recently matching a description
|
-- | Print the transaction most closely and recently matching a description
|
||||||
-- (and the query, if any).
|
-- (and the query, if any).
|
||||||
printMatch :: CliOpts -> Journal -> Text -> IO ()
|
printMatch :: CliOpts -> Journal -> Text -> IO ()
|
||||||
printMatch CliOpts{reportspec_=rspec} j desc = do
|
printMatch opts j desc = do
|
||||||
case similarTransaction' j (rsQuery rspec) desc of
|
case journalSimilarTransaction opts j desc of
|
||||||
Nothing -> putStrLn "no matches found."
|
Nothing -> putStrLn "no matches found."
|
||||||
Just t -> T.putStr $ showTransaction t
|
Just t -> T.putStr $ showTransaction t
|
||||||
|
|
||||||
where
|
|
||||||
-- Identify the closest recent match for this description in past transactions.
|
|
||||||
similarTransaction' :: Journal -> Query -> Text -> Maybe Transaction
|
|
||||||
similarTransaction' j q desc
|
|
||||||
| null historymatches = Nothing
|
|
||||||
| otherwise = Just $ snd $ head historymatches
|
|
||||||
where
|
|
||||||
historymatches = transactionsSimilarTo j q desc
|
|
||||||
|
|||||||
@ -26,6 +26,7 @@ module Hledger.Cli.Utils
|
|||||||
pivotByOpts,
|
pivotByOpts,
|
||||||
anonymiseByOpts,
|
anonymiseByOpts,
|
||||||
utcTimeToClockTime,
|
utcTimeToClockTime,
|
||||||
|
journalSimilarTransaction,
|
||||||
tests_Cli_Utils,
|
tests_Cli_Utils,
|
||||||
)
|
)
|
||||||
where
|
where
|
||||||
@ -38,7 +39,7 @@ import qualified Data.Text.IO as T
|
|||||||
import qualified Data.Text.Lazy as TL
|
import qualified Data.Text.Lazy as TL
|
||||||
import qualified Data.Text.Lazy.IO as TL
|
import qualified Data.Text.Lazy.IO as TL
|
||||||
import Data.Time (UTCTime, Day, addDays)
|
import Data.Time (UTCTime, Day, addDays)
|
||||||
import Safe (readMay)
|
import Safe (readMay, headMay)
|
||||||
import System.Console.CmdArgs
|
import System.Console.CmdArgs
|
||||||
import System.Directory (getModificationTime, getDirectoryContents, copyFile, doesFileExist)
|
import System.Directory (getModificationTime, getDirectoryContents, copyFile, doesFileExist)
|
||||||
import System.Exit
|
import System.Exit
|
||||||
@ -302,6 +303,17 @@ backupNumber f g = case g =~ ("^" ++ f ++ "\\.([0-9]+)$") of
|
|||||||
(_::FilePath, _::FilePath, _::FilePath, [ext::FilePath]) -> readMay ext
|
(_::FilePath, _::FilePath, _::FilePath, [ext::FilePath]) -> readMay ext
|
||||||
_ -> Nothing
|
_ -> Nothing
|
||||||
|
|
||||||
|
-- Identify the closest recent match for this description in past transactions.
|
||||||
|
-- If the options specify a query, only matched transactions are considered.
|
||||||
|
journalSimilarTransaction :: CliOpts -> Journal -> T.Text -> Maybe Transaction
|
||||||
|
journalSimilarTransaction cliopts j desc = mbestmatch
|
||||||
|
where
|
||||||
|
mbestmatch = snd <$> headMay bestmatches
|
||||||
|
bestmatches =
|
||||||
|
dbg1With (unlines . ("similar transactions:":) . map (\(score,Transaction{..}) -> printf "%0.3f %s %s" score (show tdate) tdescription)) $
|
||||||
|
journalTransactionsSimilarTo j q desc 10
|
||||||
|
q = queryFromFlags $ rsOpts $ reportspec_ cliopts
|
||||||
|
|
||||||
tests_Cli_Utils = tests "Utils" [
|
tests_Cli_Utils = tests "Utils" [
|
||||||
|
|
||||||
-- tests "journalApplyValue" [
|
-- tests "journalApplyValue" [
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user