From 01387548e7c772079633eb43172c75e465569183 Mon Sep 17 00:00:00 2001 From: Simon Michael Date: Thu, 6 Oct 2022 22:21:55 -1000 Subject: [PATCH] feat: csv: intra-day-reversed compensates when days' txns are reversed As in eg vanguard CSV. --- hledger-lib/Hledger/Read/CsvReader.hs | 49 +++++++++++++++------------ hledger/hledger.m4.md | 21 +++++++++++- 2 files changed, 48 insertions(+), 22 deletions(-) diff --git a/hledger-lib/Hledger/Read/CsvReader.hs b/hledger-lib/Hledger/Read/CsvReader.hs index d71906308..51cb1dcce 100644 --- a/hledger-lib/Hledger/Read/CsvReader.hs +++ b/hledger-lib/Hledger/Read/CsvReader.hs @@ -45,10 +45,10 @@ import Control.Monad.Trans.Class (lift) import Data.Char (toLower, isDigit, isSpace, isAlphaNum, ord) import Data.Bifunctor (first) import Data.Functor ((<&>)) -import Data.List (elemIndex, foldl', intersperse, mapAccumL, nub, sortBy) +import Data.List (elemIndex, foldl', intersperse, mapAccumL, nub, sortOn) +import Data.List.Extra (groupOn) import Data.Maybe (catMaybes, fromMaybe, isJust) import Data.MemoUgly (memo) -import Data.Ord (comparing) import qualified Data.Set as S import Data.Text (Text) import qualified Data.Text as T @@ -358,7 +358,7 @@ Grammar for the CSV conversion rules, more or less: RULES: RULE* -RULE: ( FIELD-LIST | FIELD-ASSIGNMENT | CONDITIONAL-BLOCK | SKIP | NEWEST-FIRST | DATE-FORMAT | DECIMAL-MARK | COMMENT | BLANK ) NEWLINE +RULE: ( FIELD-LIST | FIELD-ASSIGNMENT | CONDITIONAL-BLOCK | SKIP | TIMEZONE | NEWEST-FIRST | INTRA-DAY-REVERSED | DATE-FORMAT | DECIMAL-MARK | COMMENT | BLANK ) NEWLINE FIELD-LIST: fields SPACE FIELD-NAME ( SPACE? , SPACE? FIELD-NAME )* @@ -462,6 +462,7 @@ directives = ,"skip" ,"timezone" ,"newest-first" + ,"intra-day-reversed" , "balance-type" ] @@ -750,29 +751,35 @@ readJournalFromCsv mrulesfile csvfile csvdata = do Just f | any (`T.isInfixOf` f) ["%Z","%z","%EZ","%Ez"] -> True _ -> False - -- Ensure transactions are ordered chronologically. - -- First, if the CSV records seem to be most-recent-first (because - -- there's an explicit "newest-first" directive, or there's more - -- than one date and the first date is more recent than the last): - -- reverse them to get same-date transactions ordered chronologically. - txns' = - (if newestfirst || mdataseemsnewestfirst == Just True - then dbg7 "reversed csv txns" . reverse else id) - txns - where - newestfirst = dbg6 "newestfirst" $ isJust $ getDirective "newest-first" rules - mdataseemsnewestfirst = dbg6 "mdataseemsnewestfirst" $ - case nub $ map tdate txns of - ds | length ds > 1 -> Just $ head ds > last ds - _ -> Nothing - -- Second, sort by date. - txns'' = dbg7 "date-sorted csv txns" $ sortBy (comparing tdate) txns' + -- Do our best to ensure transactions will be ordered chronologically, + -- from oldest to newest. This is done in several steps: + -- 1. Intra-day order: if there's an "intra-day-reversed" rule, + -- assume each day's CSV records were ordered in reverse of the overall date order, + -- so reverse each day's txns. + intradayreversed = dbg6 "intra-day-reversed" $ isJust $ getDirective "intra-day-reversed" rules + txns1 = dbg7 "txns1" $ + (if intradayreversed then concatMap reverse . groupOn tdate else id) txns + -- 2. Overall date order: now if there's a "newest-first" rule, + -- or if there's multiple dates and the first is more recent than the last, + -- assume CSV records were ordered newest dates first, + -- so reverse all txns. + newestfirst = dbg6 "newest-first" $ isJust $ getDirective "newest-first" rules + mdatalooksnewestfirst = dbg6 "mdatalooksnewestfirst" $ + case nub $ map tdate txns of + ds | length ds > 1 -> Just $ head ds > last ds + _ -> Nothing + txns2 = dbg7 "txns2" $ + (if newestfirst || mdatalooksnewestfirst == Just True then reverse else id) txns1 + -- 3. Disordered dates: in case the CSV records were ordered by chaos, + -- do a final sort by date. If it was only a few records out of order, + -- this will hopefully refine any good ordering done by steps 1 and 2. + txns3 = dbg7 "date-sorted csv txns" $ sortOn tdate txns2 liftIO $ unless rulesfileexists $ do dbg1IO "creating conversion rules file" rulesfile T.writeFile rulesfile rulestext - return nulljournal{jtxns=txns''} + return nulljournal{jtxns=txns3} -- | Parse special separator names TAB and SPACE, or return the first -- character. Return Nothing on empty string diff --git a/hledger/hledger.m4.md b/hledger/hledger.m4.md index 49e257c20..ec1f86e7b 100644 --- a/hledger/hledger.m4.md +++ b/hledger/hledger.m4.md @@ -3910,7 +3910,8 @@ these are described more fully below, after the examples: | [**`end`**](#end) | skip the remaining CSV records | | [**`date-format`**](#date-format) | how to parse dates in CSV records | | [**`decimal-mark`**](#decimal-mark) | the decimal mark used in CSV amounts, if ambiguous | -| [**`newest-first`**](#newest-first) | disambiguate record order when there's only one date | +| [**`newest-first`**](#newest-first) | improve txn order when there are multiple records, newest first, all with the same date | +| [**`intra-day-reversed`**](#intra-day-reversed) | improve txn order when each day's txns are reverse of the overall date order | | [**`include`**](#include) | inline another CSV rules file | | [**`balance-type`**](#balance-type) | choose which type of balance assignments to use | @@ -4674,6 +4675,24 @@ then, you should add the `newest-first` rule as a hint. Eg: newest-first ``` +### `intra-day-reversed` + +CSV records for each day are sometimes ordered in reverse compared to the overall date order. +Eg, here dates are newest first, but the transactions on each date are oldest first: +```csv +2022-10-02, txn 3... +2022-10-02, txn 4... +2022-10-01, txn 1... +2022-10-01, txn 2... +``` +In this situation, add the `intra-day-reversed` rule, and hledger will compensate, +improving the order of transactions. +```rules +# transactions within each day are reversed, so reverse them back +intra-day-reversed +``` + + ### `include`