add, lib: debug output, refactor similar transactions lookup
add --debug=1 shows the top hits for similar past transactions. added: Hledger.Cli.Utils.journalSimilarTransaction provides --debug=1 output changed: Hledger.Cli.Commands.Add.transactionsSimilarTo -> Hledger.Data.Journal.journalTransactionsSimilarTo now takes an extra number-of-results argument
This commit is contained in:
		
							parent
							
								
									7979c7d74c
								
							
						
					
					
						commit
						44508f58c8
					
				| @ -67,6 +67,7 @@ module Hledger.Data.Journal ( | |||||||
|   journalNextTransaction, |   journalNextTransaction, | ||||||
|   journalPrevTransaction, |   journalPrevTransaction, | ||||||
|   journalPostings, |   journalPostings, | ||||||
|  |   journalTransactionsSimilarTo, | ||||||
|   -- journalPrices, |   -- journalPrices, | ||||||
|   -- * Standard account types |   -- * Standard account types | ||||||
|   journalBalanceSheetAccountQuery, |   journalBalanceSheetAccountQuery, | ||||||
| @ -96,6 +97,7 @@ import "extra" Control.Monad.Extra (whenM) | |||||||
| import Control.Monad.Reader as R | import Control.Monad.Reader as R | ||||||
| import Control.Monad.ST (ST, runST) | import Control.Monad.ST (ST, runST) | ||||||
| import Data.Array.ST (STArray, getElems, newListArray, writeArray) | import Data.Array.ST (STArray, getElems, newListArray, writeArray) | ||||||
|  | import Data.Char (toUpper) | ||||||
| import Data.Default (Default(..)) | import Data.Default (Default(..)) | ||||||
| import Data.Function ((&)) | import Data.Function ((&)) | ||||||
| import qualified Data.HashTable.Class as H (toList) | import qualified Data.HashTable.Class as H (toList) | ||||||
| @ -125,6 +127,7 @@ import Hledger.Data.Transaction | |||||||
| import Hledger.Data.TransactionModifier | import Hledger.Data.TransactionModifier | ||||||
| import Hledger.Data.Posting | import Hledger.Data.Posting | ||||||
| import Hledger.Query | import Hledger.Query | ||||||
|  | import Data.List (sortBy) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| -- try to make Journal ppShow-compatible | -- try to make Journal ppShow-compatible | ||||||
| @ -321,6 +324,55 @@ journalAccountNames = journalAccountNamesDeclaredOrImplied | |||||||
| journalAccountNameTree :: Journal -> Tree AccountName | journalAccountNameTree :: Journal -> Tree AccountName | ||||||
| journalAccountNameTree = accountNameTreeFrom . journalAccountNames | journalAccountNameTree = accountNameTreeFrom . journalAccountNames | ||||||
| 
 | 
 | ||||||
|  | -- | Find up to N most similar and most recent transactions matching | ||||||
|  | -- the given transaction description and query. Transactions are | ||||||
|  | -- listed with their description's similarity score (see | ||||||
|  | -- compareDescriptions), sorted by highest score and then by date. | ||||||
|  | -- Only transactions with a similarity score greater than the minimum | ||||||
|  | -- threshold (currently 0) are returned. | ||||||
|  | journalTransactionsSimilarTo :: Journal -> Query -> Text -> Int -> [(Double,Transaction)] | ||||||
|  | journalTransactionsSimilarTo j q desc n = | ||||||
|  |   take n $ | ||||||
|  |   sortBy (\(s1,t1) (s2,t2) -> compare (s2,tdate t2) (s1,tdate t1)) $ | ||||||
|  |   filter ((> threshold).fst) | ||||||
|  |   [(compareDescriptions desc $ tdescription t, t) | t <- ts] | ||||||
|  |   where | ||||||
|  |     ts = filter (q `matchesTransaction`) $ jtxns j | ||||||
|  |     threshold = 0 | ||||||
|  | 
 | ||||||
|  | -- | Return a similarity measure, from 0 to 1, for two transaction | ||||||
|  | -- descriptions.  This is like compareStrings, but first strips out | ||||||
|  | -- any numbers, to improve accuracy eg when there are bank transaction | ||||||
|  | -- ids from imported data. | ||||||
|  | compareDescriptions :: Text -> Text -> Double | ||||||
|  | compareDescriptions s t = compareStrings s' t' | ||||||
|  |     where s' = simplify $ T.unpack s | ||||||
|  |           t' = simplify $ T.unpack t | ||||||
|  |           simplify = filter (not . (`elem` ("0123456789" :: String))) | ||||||
|  | 
 | ||||||
|  | -- | Return a similarity measure, from 0 to 1, for two strings.  This | ||||||
|  | -- was based on Simon White's string similarity algorithm | ||||||
|  | -- (http://www.catalysoft.com/articles/StrikeAMatch.html), later found | ||||||
|  | -- to be https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient, | ||||||
|  | -- modified to handle short strings better. | ||||||
|  | -- Todo: check out http://nlp.fi.muni.cz/raslan/2008/raslan08.pdf#page=14 . | ||||||
|  | compareStrings :: String -> String -> Double | ||||||
|  | compareStrings "" "" = 1 | ||||||
|  | compareStrings [_] "" = 0 | ||||||
|  | compareStrings "" [_] = 0 | ||||||
|  | compareStrings [a] [b] = if toUpper a == toUpper b then 1 else 0 | ||||||
|  | compareStrings s1 s2 = 2 * commonpairs / totalpairs | ||||||
|  |     where | ||||||
|  |       pairs1      = S.fromList $ wordLetterPairs $ uppercase s1 | ||||||
|  |       pairs2      = S.fromList $ wordLetterPairs $ uppercase s2 | ||||||
|  |       commonpairs = fromIntegral $ S.size $ S.intersection pairs1 pairs2 | ||||||
|  |       totalpairs  = fromIntegral $ S.size pairs1 + S.size pairs2 | ||||||
|  | 
 | ||||||
|  | wordLetterPairs = concatMap letterPairs . words | ||||||
|  | 
 | ||||||
|  | letterPairs (a:b:rest) = [a,b] : letterPairs (b:rest) | ||||||
|  | letterPairs _ = [] | ||||||
|  | 
 | ||||||
| -- queries for standard account types | -- queries for standard account types | ||||||
| 
 | 
 | ||||||
| -- | Get a query for accounts of the specified types in this journal.  | -- | Get a query for accounts of the specified types in this journal.  | ||||||
|  | |||||||
| @ -11,7 +11,6 @@ module Hledger.Cli.Commands.Add ( | |||||||
|   ,add |   ,add | ||||||
|   ,appendToJournalFileOrStdout |   ,appendToJournalFileOrStdout | ||||||
|   ,journalAddTransaction |   ,journalAddTransaction | ||||||
|   ,transactionsSimilarTo |  | ||||||
| ) | ) | ||||||
| where | where | ||||||
| 
 | 
 | ||||||
| @ -26,7 +25,6 @@ import Data.Char (toUpper, toLower) | |||||||
| import Data.Either (isRight) | import Data.Either (isRight) | ||||||
| import Data.Functor.Identity (Identity(..)) | import Data.Functor.Identity (Identity(..)) | ||||||
| import "base-compat-batteries" Data.List.Compat | import "base-compat-batteries" Data.List.Compat | ||||||
| import qualified Data.Set as S |  | ||||||
| import Data.Maybe (fromJust, fromMaybe, isJust) | import Data.Maybe (fromJust, fromMaybe, isJust) | ||||||
| import Data.Text (Text) | import Data.Text (Text) | ||||||
| import qualified Data.Text as T | import qualified Data.Text as T | ||||||
| @ -49,6 +47,7 @@ import Text.Printf | |||||||
| import Hledger | import Hledger | ||||||
| import Hledger.Cli.CliOptions | import Hledger.Cli.CliOptions | ||||||
| import Hledger.Cli.Commands.Register (postingsReportAsText) | import Hledger.Cli.Commands.Register (postingsReportAsText) | ||||||
|  | import Hledger.Cli.Utils (journalSimilarTransaction) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| addmode = hledgerCommandMode | addmode = hledgerCommandMode | ||||||
| @ -176,7 +175,7 @@ confirmedTransactionWizard prevInput es@EntryState{..} stack@(currentStage : _) | |||||||
| 
 | 
 | ||||||
|   EnterDescAndComment (date, code) -> descriptionAndCommentWizard prevInput es >>= \case |   EnterDescAndComment (date, code) -> descriptionAndCommentWizard prevInput es >>= \case | ||||||
|     Just (desc, comment) -> do |     Just (desc, comment) -> do | ||||||
|       let mbaset = similarTransaction es desc |       let mbaset = journalSimilarTransaction esOpts esJournal desc | ||||||
|           es' = es |           es' = es | ||||||
|             { esArgs = drop 1 esArgs |             { esArgs = drop 1 esArgs | ||||||
|             , esPostings = [] |             , esPostings = [] | ||||||
| @ -258,15 +257,6 @@ confirmedTransactionWizard prevInput es@EntryState{..} stack@(currentStage : _) | |||||||
|   where |   where | ||||||
|     replaceNthOrAppend n newElem xs = take n xs ++ [newElem] ++ drop (n + 1) xs |     replaceNthOrAppend n newElem xs = take n xs ++ [newElem] ++ drop (n + 1) xs | ||||||
| 
 | 
 | ||||||
| -- Identify the closest recent match for this description in past transactions. |  | ||||||
| similarTransaction :: EntryState -> Text -> Maybe Transaction |  | ||||||
| similarTransaction EntryState{..} desc = |  | ||||||
|   let q = queryFromFlags . rsOpts $ reportspec_ esOpts |  | ||||||
|       historymatches = transactionsSimilarTo esJournal q desc |  | ||||||
|       bestmatch | null historymatches = Nothing |  | ||||||
|                 | otherwise           = Just $ snd $ head historymatches |  | ||||||
|   in bestmatch |  | ||||||
| 
 |  | ||||||
| dateAndCodeWizard PrevInput{..} EntryState{..} = do | dateAndCodeWizard PrevInput{..} EntryState{..} = do | ||||||
|   let def = headDef (T.unpack $ showDate esDefDate) esArgs |   let def = headDef (T.unpack $ showDate esDefDate) esArgs | ||||||
|   retryMsg "A valid hledger smart date is required. Eg: 2014/2/14, 14, yesterday." $ |   retryMsg "A valid hledger smart date is required. Eg: 2014/2/14, 14, yesterday." $ | ||||||
| @ -482,49 +472,3 @@ registerFromString s = do | |||||||
| capitalize :: String -> String | capitalize :: String -> String | ||||||
| capitalize "" = "" | capitalize "" = "" | ||||||
| capitalize (c:cs) = toUpper c : cs | capitalize (c:cs) = toUpper c : cs | ||||||
| 
 |  | ||||||
| -- | Find the most similar and recent transactions matching the given |  | ||||||
| -- transaction description and report query.  Transactions are listed |  | ||||||
| -- with their "relevancy" score, most relevant first. |  | ||||||
| transactionsSimilarTo :: Journal -> Query -> Text -> [(Double,Transaction)] |  | ||||||
| transactionsSimilarTo j q desc = |  | ||||||
|     sortBy compareRelevanceAndRecency |  | ||||||
|                $ filter ((> threshold).fst) |  | ||||||
|                [(compareDescriptions desc $ tdescription t, t) | t <- ts] |  | ||||||
|     where |  | ||||||
|       compareRelevanceAndRecency (n1,t1) (n2,t2) = compare (n2,tdate t2) (n1,tdate t1) |  | ||||||
|       ts = filter (q `matchesTransaction`) $ jtxns j |  | ||||||
|       threshold = 0 |  | ||||||
| 
 |  | ||||||
| -- | Return a similarity measure, from 0 to 1, for two transaction |  | ||||||
| -- descriptions.  This is like compareStrings, but first strips out |  | ||||||
| -- any numbers, to improve accuracy eg when there are bank transaction |  | ||||||
| -- ids from imported data. |  | ||||||
| compareDescriptions :: Text -> Text -> Double |  | ||||||
| compareDescriptions s t = compareStrings s' t' |  | ||||||
|     where s' = simplify $ T.unpack s |  | ||||||
|           t' = simplify $ T.unpack t |  | ||||||
|           simplify = filter (not . (`elem` ("0123456789" :: String))) |  | ||||||
| 
 |  | ||||||
| -- | Return a similarity measure, from 0 to 1, for two strings.  This |  | ||||||
| -- was based on Simon White's string similarity algorithm |  | ||||||
| -- (http://www.catalysoft.com/articles/StrikeAMatch.html), later found |  | ||||||
| -- to be https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient, |  | ||||||
| -- modified to handle short strings better. |  | ||||||
| -- Todo: check out http://nlp.fi.muni.cz/raslan/2008/raslan08.pdf#page=14 . |  | ||||||
| compareStrings :: String -> String -> Double |  | ||||||
| compareStrings "" "" = 1 |  | ||||||
| compareStrings [_] "" = 0 |  | ||||||
| compareStrings "" [_] = 0 |  | ||||||
| compareStrings [a] [b] = if toUpper a == toUpper b then 1 else 0 |  | ||||||
| compareStrings s1 s2 = 2 * commonpairs / totalpairs |  | ||||||
|     where |  | ||||||
|       pairs1      = S.fromList $ wordLetterPairs $ uppercase s1 |  | ||||||
|       pairs2      = S.fromList $ wordLetterPairs $ uppercase s2 |  | ||||||
|       commonpairs = fromIntegral $ S.size $ S.intersection pairs1 pairs2 |  | ||||||
|       totalpairs  = fromIntegral $ S.size pairs1 + S.size pairs2 |  | ||||||
| 
 |  | ||||||
| wordLetterPairs = concatMap letterPairs . words |  | ||||||
| 
 |  | ||||||
| letterPairs (a:b:rest) = [a,b] : letterPairs (b:rest) |  | ||||||
| letterPairs _ = [] |  | ||||||
|  | |||||||
| @ -32,7 +32,6 @@ import Hledger.Read.CsvReader (CSV, printCSV) | |||||||
| import Hledger | import Hledger | ||||||
| import Hledger.Cli.CliOptions | import Hledger.Cli.CliOptions | ||||||
| import Hledger.Cli.Utils | import Hledger.Cli.Utils | ||||||
| import Hledger.Cli.Commands.Add ( transactionsSimilarTo ) |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| printmode = hledgerCommandMode | printmode = hledgerCommandMode | ||||||
| @ -194,16 +193,7 @@ postingToCSV p = | |||||||
| -- | Print the transaction most closely and recently matching a description | -- | Print the transaction most closely and recently matching a description | ||||||
| -- (and the query, if any). | -- (and the query, if any). | ||||||
| printMatch :: CliOpts -> Journal -> Text -> IO () | printMatch :: CliOpts -> Journal -> Text -> IO () | ||||||
| printMatch CliOpts{reportspec_=rspec} j desc = do | printMatch opts j desc = do | ||||||
|   case similarTransaction' j (rsQuery rspec) desc of |   case journalSimilarTransaction opts j desc of | ||||||
|       Nothing -> putStrLn "no matches found." |     Nothing -> putStrLn "no matches found." | ||||||
|       Just t  -> T.putStr $ showTransaction t |     Just t  -> T.putStr $ showTransaction t | ||||||
| 
 |  | ||||||
|   where |  | ||||||
|     -- Identify the closest recent match for this description in past transactions. |  | ||||||
|     similarTransaction' :: Journal -> Query -> Text -> Maybe Transaction |  | ||||||
|     similarTransaction' j q desc |  | ||||||
|       | null historymatches = Nothing |  | ||||||
|       | otherwise           = Just $ snd $ head historymatches |  | ||||||
|       where |  | ||||||
|         historymatches = transactionsSimilarTo j q desc |  | ||||||
|  | |||||||
| @ -26,6 +26,7 @@ module Hledger.Cli.Utils | |||||||
|      pivotByOpts, |      pivotByOpts, | ||||||
|      anonymiseByOpts, |      anonymiseByOpts, | ||||||
|      utcTimeToClockTime, |      utcTimeToClockTime, | ||||||
|  |      journalSimilarTransaction, | ||||||
|      tests_Cli_Utils, |      tests_Cli_Utils, | ||||||
|     ) |     ) | ||||||
| where | where | ||||||
| @ -38,7 +39,7 @@ import qualified Data.Text.IO as T | |||||||
| import qualified Data.Text.Lazy as TL | import qualified Data.Text.Lazy as TL | ||||||
| import qualified Data.Text.Lazy.IO as TL | import qualified Data.Text.Lazy.IO as TL | ||||||
| import Data.Time (UTCTime, Day, addDays) | import Data.Time (UTCTime, Day, addDays) | ||||||
| import Safe (readMay) | import Safe (readMay, headMay) | ||||||
| import System.Console.CmdArgs | import System.Console.CmdArgs | ||||||
| import System.Directory (getModificationTime, getDirectoryContents, copyFile, doesFileExist) | import System.Directory (getModificationTime, getDirectoryContents, copyFile, doesFileExist) | ||||||
| import System.Exit | import System.Exit | ||||||
| @ -302,6 +303,17 @@ backupNumber f g = case g =~ ("^" ++ f ++ "\\.([0-9]+)$") of | |||||||
|                         (_::FilePath, _::FilePath, _::FilePath, [ext::FilePath]) -> readMay ext |                         (_::FilePath, _::FilePath, _::FilePath, [ext::FilePath]) -> readMay ext | ||||||
|                         _ -> Nothing |                         _ -> Nothing | ||||||
| 
 | 
 | ||||||
|  | -- Identify the closest recent match for this description in past transactions. | ||||||
|  | -- If the options specify a query, only matched transactions are considered. | ||||||
|  | journalSimilarTransaction :: CliOpts -> Journal -> T.Text -> Maybe Transaction | ||||||
|  | journalSimilarTransaction cliopts j desc = mbestmatch | ||||||
|  |   where | ||||||
|  |     mbestmatch = snd <$> headMay bestmatches | ||||||
|  |     bestmatches = | ||||||
|  |       dbg1With (unlines . ("similar transactions:":) . map (\(score,Transaction{..}) -> printf "%0.3f %s %s" score (show tdate) tdescription)) $ | ||||||
|  |       journalTransactionsSimilarTo j q desc 10 | ||||||
|  |     q = queryFromFlags $ rsOpts $ reportspec_ cliopts | ||||||
|  | 
 | ||||||
| tests_Cli_Utils = tests "Utils" [ | tests_Cli_Utils = tests "Utils" [ | ||||||
| 
 | 
 | ||||||
|   --  tests "journalApplyValue" [ |   --  tests "journalApplyValue" [ | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user