diff --git a/hledger-lib/Hledger/Read/CsvReader.hs b/hledger-lib/Hledger/Read/CsvReader.hs
index 83c594e19..ef7e33bfa 100644
--- a/hledger-lib/Hledger/Read/CsvReader.hs
+++ b/hledger-lib/Hledger/Read/CsvReader.hs
@@ -3,6 +3,10 @@ A reader for CSV data, using an extra rules file to help interpret the data.
 
 -}
 
+-- Lots of haddocks in this file are for non-exported types.
+-- Here's a command that will render them:
+-- stack haddock hledger-lib --fast --no-haddock-deps --haddock-arguments='--ignore-all-exports' --open
+
 {-# LANGUAGE CPP #-}
 {-# LANGUAGE FlexibleContexts #-}
 {-# LANGUAGE ScopedTypeVariables #-}
@@ -12,6 +16,8 @@ A reader for CSV data, using an extra rules file to help interpret the data.
 {-# LANGUAGE TypeSynonymInstances #-}
 {-# LANGUAGE FlexibleInstances #-}
 {-# LANGUAGE PackageImports #-}
+{-# LANGUAGE MultiWayIf #-}
+{-# LANGUAGE NamedFieldPuns #-}
 {-# LANGUAGE RecordWildCards #-}
 
 module Hledger.Read.CsvReader (
@@ -115,15 +121,19 @@ parseSeparator = specials . map toLower
 
 -- | Read a Journal from the given CSV data (and filename, used for error
 -- messages), or return an error. Proceed as follows:
--- @
+--
 -- 1. parse CSV conversion rules from the specified rules file, or from
 -- the default rules file for the specified CSV file, if it exists,
 -- or throw a parse error; if it doesn't exist, use built-in default rules
+--
 -- 2. parse the CSV data, or throw a parse error
+--
 -- 3. convert the CSV records to transactions using the rules
+--
 -- 4. if the rules file didn't exist, create it with the default rules and filename
+--
 -- 5. return the transactions as a Journal
--- @
+--
 readJournalFromCsv :: Maybe FilePath -> FilePath -> Text -> IO (Either String Journal)
 readJournalFromCsv Nothing "-" _ = return $ Left "please use --rules-file when reading CSV from stdin"
 readJournalFromCsv mrulesfile csvfile csvdata =
@@ -379,30 +389,44 @@ convert a particular CSV data file into meaningful journal transactions.
 -}
 data CsvRules = CsvRules {
   rdirectives :: [(DirectiveName,String)],
+    -- ^ top-level rules, as (keyword, value) pairs
   rcsvfieldindexes :: [(CsvFieldName, CsvFieldIndex)],
+    -- ^ csv field names and their column number, if declared by a fields list
   rassignments :: [(JournalFieldName, FieldTemplate)],
+    -- ^ top-level assignments to hledger fields, as (field name, value template) pairs
   rconditionalblocks :: [ConditionalBlock]
+    -- ^ conditional blocks, containing additional assignments/rules to apply to matched csv records
 } deriving (Show, Eq)
 
 type CsvRulesParser a = StateT CsvRules SimpleTextParser a
 
+-- | The keyword of a CSV rule - "fields", "skip", "if", etc.
 type DirectiveName = String
+-- | CSV field name.
 type CsvFieldName = String
+-- | 1-based CSV column number.
 type CsvFieldIndex = Int
-type JournalFieldName = String
+-- | Percent symbol followed by a CSV field name or column number. Eg: %date, %1.
+type CsvFieldReference = String
+-- | One of the standard hledger field names that can be assigned to.
+type JournalFieldName = String -- XXX rename to HledgerFieldName
+-- | A text value to be assigned to a hledger field, possibly
+-- containing csv field references to be interpolated.
 type FieldTemplate = String
+-- | A strptime date parsing pattern, as supported by Data.Time.Format.
 type DateFormat = String
+-- | A regular expression.
 type RegexpPattern = String
 
 -- | A single test for matching a CSV record, in one way or another.
data Matcher = RecordMatcher RegexpPattern -- ^ match if this regexp matches the overall CSV record - -- | FieldMatcher CsvFieldName RegexpPattern -- ^ match if this regexp matches the named CSV field + | FieldMatcher CsvFieldReference RegexpPattern -- ^ match if this regexp matches the referenced CSV field's value deriving (Show, Eq) -- | A conditional block: a set of CSV record matchers, and a sequence --- of zero or more rules which will be enabled only when one or of the --- matchers succeeds. +-- of rules which will be enabled only if one or more of the matchers +-- succeeds. -- -- Three types of rule are allowed inside conditional blocks: field -- assignments, skip, end. (A skip or end rule is stored as if it was @@ -634,6 +658,7 @@ fieldvalp = do lift $ dbgparse 2 "trying fieldvalp" anySingle `manyTill` lift eolof +-- A conditional block: one or more matchers, one per line, followed by one or more indented rules. conditionalblockp :: CsvRulesParser ConditionalBlock conditionalblockp = do lift $ dbgparse 3 "trying conditionalblockp" @@ -645,10 +670,14 @@ conditionalblockp = do return $ CB{cbMatchers=ms, cbAssignments=as} "conditional block" --- A single matcher, on one line --- XXX Currently only parses a RecordMatcher +-- A single matcher, on one line. matcherp :: CsvRulesParser Matcher -matcherp = do +matcherp = try fieldmatcherp <|> recordmatcherp + +-- A single whole-record matcher. +-- A pattern on the whole line, not containing any of the match operators (~). +recordmatcherp :: CsvRulesParser Matcher +recordmatcherp = do lift $ dbgparse 2 "trying matcherp" -- pos <- currentPos _ <- optional (matchoperatorp >> lift (skipMany spacenonewline) >> optional newline) @@ -658,8 +687,35 @@ matcherp = do return $ RecordMatcher r "record matcher" --- An operator indicating the type of match --- XXX currently only ~ (regex), unused +-- | A single matcher for a specific field. A csv field reference (like %date or %1), +-- a match operator (~), and a pattern on the rest of the line, optionally +-- space-separated. Eg: +-- %description ~ chez jacques +fieldmatcherp :: CsvRulesParser Matcher +fieldmatcherp = do + lift $ dbgparse 2 "trying fieldmatcher" + -- An optional fieldname (default: "all") + -- f <- fromMaybe "all" `fmap` (optional $ do + -- f' <- fieldnamep + -- lift (skipMany spacenonewline) + -- return f') + f <- csvfieldreferencep <* lift (skipMany spacenonewline) + -- optional operator.. just ~ (case insensitive infix regex) for now + _op <- fromMaybe "~" <$> optional matchoperatorp + lift (skipMany spacenonewline) + r <- regexp + return $ FieldMatcher f r + "field matcher" + +csvfieldreferencep :: CsvRulesParser CsvFieldReference +csvfieldreferencep = do + lift $ dbgparse 3 "trying csvfieldreferencep" + char '%' + f <- fieldnamep + return $ '%' : quoteIfNeeded f + +-- A match operator, indicating the type of match to perform. +-- Currently just ~ meaning case insensitive infix regex match. 
matchoperatorp :: CsvRulesParser String matchoperatorp = fmap T.unpack $ choiceInState $ map string ["~" @@ -677,19 +733,6 @@ regexp = do cs <- anySingle `manyTill` lift eolof return $ strip $ c:cs --- fieldmatcher = do --- dbgparse 2 "trying fieldmatcher" --- f <- fromMaybe "all" `fmap` (optional $ do --- f' <- fieldname --- lift (skipMany spacenonewline) --- return f') --- char '~' --- lift (skipMany spacenonewline) --- ps <- patterns --- let r = "(" ++ intercalate "|" ps ++ ")" --- return (f,r) --- "field matcher" - -------------------------------------------------------------------------------- -- Converting CSV records to journal transactions @@ -962,40 +1005,64 @@ showRecord :: CsvRecord -> String showRecord r = "the CSV record is: "++intercalate "," (map show r) -- | Given the conversion rules, a CSV record and a journal entry field name, find --- the template value ultimately assigned to this field, either at top --- level or in a matching conditional block. Conditional blocks' --- patterns are matched against an approximation of the original CSV --- record: all the field values with commas intercalated. +-- the template value ultimately assigned to this field, if any, +-- by a field assignment at top level or in a conditional block matching this record. +-- +-- Note conditional blocks' patterns are matched against an approximation of the +-- CSV record: all the field values, without enclosing quotes, comma-separated. +-- getEffectiveAssignment :: CsvRules -> CsvRecord -> JournalFieldName -> Maybe FieldTemplate -getEffectiveAssignment rules record f = lastMay $ assignmentsFor f +getEffectiveAssignment rules record f = lastMay $ map snd $ assignments where - assignmentsFor f = map snd $ filter ((==f).fst) $ toplevelassignments ++ conditionalassignments + -- all active assignments to field f, in order + assignments = dbg2 "assignments" $ filter ((==f).fst) $ toplevelassignments ++ conditionalassignments where + -- all top level field assignments toplevelassignments = rassignments rules - conditionalassignments = concatMap cbAssignments $ filter blockMatches $ blocksAssigning f + -- all field assignments in conditional blocks assigning to field f and active for the current csv record + conditionalassignments = concatMap cbAssignments $ filter isBlockActive $ blocksAssigning f where + -- all conditional blocks which can potentially assign field f blocksAssigning f = filter (any ((==f).fst) . cbAssignments) $ rconditionalblocks rules - blockMatches :: ConditionalBlock -> Bool - blockMatches CB{..} = any matcherMatches cbMatchers + -- does this conditional block match the current csv record ? + isBlockActive :: ConditionalBlock -> Bool + isBlockActive CB{..} = any matcherMatches cbMatchers where + -- does this individual matcher match the current csv record ? matcherMatches :: Matcher -> Bool - matcherMatches (RecordMatcher pat) = regexMatchesCI pat csvline + matcherMatches (RecordMatcher pat) = regexMatchesCI pat wholecsvline where - csvline = intercalate "," record - -- matcherMatches (FieldMatcher field pat) = undefined + -- a synthetic whole CSV record to match against; note, it has + -- no quotes enclosing fields, and is always comma-separated, + -- so may differ from the actual record, and may not be valid CSV. + wholecsvline = dbg3 "wholecsvline" $ intercalate "," record + matcherMatches (FieldMatcher csvfieldref pat) = regexMatchesCI pat csvfieldvalue + where + -- the value of the referenced CSV field to match against. 
+            csvfieldvalue = dbg3 "csvfieldvalue" $ replaceCsvFieldReference rules record csvfieldref
 
 -- | Render a field assigment's template, possibly interpolating referenced
 -- CSV field values. Outer whitespace is removed from interpolated values.
 renderTemplate :: CsvRules -> CsvRecord -> FieldTemplate -> String
-renderTemplate rules record t = regexReplaceBy "%[A-z0-9_-]+" replace t
-  where
-    replace ('%':pat) = maybe pat (\i -> strip $ atDef "" record (i-1)) mindex
-      where
-        mindex | all isDigit pat = readMay pat
-               | otherwise = lookup (map toLower pat) $ rcsvfieldindexes rules
-    replace pat = pat
+renderTemplate rules record t = regexReplaceBy "%[A-z0-9_-]+" (replaceCsvFieldReference rules record) t
 
--- Parse the date string using the specified date-format, or if unspecified try these default formats:
+-- | Replace something that looks like a reference to a csv field ("%date" or "%1")
+-- with that field's value. If it doesn't look like a field reference, or if we
+-- can't find such a field, leave it unchanged.
+replaceCsvFieldReference :: CsvRules -> CsvRecord -> CsvFieldReference -> String
+replaceCsvFieldReference rules record s@('%':fieldname) = fromMaybe s $ csvFieldValue rules record fieldname
+replaceCsvFieldReference _ _ s = s
+
+-- | Get the (whitespace-stripped) value of a CSV field, identified by its name or
+-- column number ("date" or "1"), from the given CSV record, if such a field exists.
+csvFieldValue :: CsvRules -> CsvRecord -> CsvFieldName -> Maybe String
+csvFieldValue rules record fieldname = do
+  fieldindex <- if | all isDigit fieldname -> readMay fieldname
+                   | otherwise             -> lookup (map toLower fieldname) $ rcsvfieldindexes rules
+  fieldvalue <- strip <$> atMay record (fieldindex-1)
+  return fieldvalue
+
+-- | Parse the date string using the specified date-format, or if unspecified try these default formats:
 -- YYYY/MM/DD, YYYY-MM-DD, YYYY.MM.DD, MM/DD/YYYY (month and day can be 1 or 2 digits, year must be 4).
parseDateWithFormatOrDefaultFormats :: Maybe DateFormat -> String -> Maybe Day parseDateWithFormatOrDefaultFormats mformat s = firstJust $ map parsewith formats @@ -1025,26 +1092,60 @@ parseDateWithFormatOrDefaultFormats mformat s = firstJust $ map parsewith format tests_CsvReader = tests "CsvReader" [ tests "parseCsvRules" [ - test"empty file" $ + test "empty file" $ parseCsvRules "unknown" "" @?= Right defrules - ] + ] ,tests "rulesp" [ - test"trailing comments" $ + test "trailing comments" $ parseWithState' defrules rulesp "skip\n# \n#\n" @?= Right defrules{rdirectives = [("skip","")]} - ,test"trailing blank lines" $ + ,test "trailing blank lines" $ parseWithState' defrules rulesp "skip\n\n \n" @?= (Right defrules{rdirectives = [("skip","")]}) - ,test"no final newline" $ + ,test "no final newline" $ parseWithState' defrules rulesp "skip" @?= (Right defrules{rdirectives=[("skip","")]}) - ,test"assignment with empty value" $ + ,test "assignment with empty value" $ parseWithState' defrules rulesp "account1 \nif foo\n account2 foo\n" @?= (Right defrules{rassignments = [("account1","")], rconditionalblocks = [CB{cbMatchers=[RecordMatcher "foo"],cbAssignments=[("account2","foo")]}]}) - ] + ] ,tests "conditionalblockp" [ - test"space after conditional" $ -- #1120 + test "space after conditional" $ -- #1120 parseWithState' defrules conditionalblockp "if a\n account2 b\n \n" @?= (Right $ CB{cbMatchers=[RecordMatcher "a"],cbAssignments=[("account2","b")]}) + + ,tests "csvfieldreferencep" [ + test "number" $ parseWithState' defrules csvfieldreferencep "%1" @?= (Right "%1") + ,test "name" $ parseWithState' defrules csvfieldreferencep "%date" @?= (Right "%date") + ,test "quoted name" $ parseWithState' defrules csvfieldreferencep "%\"csv date\"" @?= (Right "%\"csv date\"") + ] + + ,tests "matcherp" [ + + test "recordmatcherp" $ + parseWithState' defrules matcherp "A A\n" @?= (Right $ RecordMatcher "A A") + + ,test "fieldmatcherp.starts-with-%" $ + parseWithState' defrules matcherp "description A A\n" @?= (Right $ RecordMatcher "description A A") + + ,test "fieldmatcherp" $ + parseWithState' defrules matcherp "%description A A\n" @?= (Right $ FieldMatcher "%description" "A A") + + ,test "fieldmatcherp with operator" $ + parseWithState' defrules matcherp "%description ~ A A\n" @?= (Right $ FieldMatcher "%description" "A A") + + ] + + ,tests "getEffectiveAssignment" [ + let rules = defrules{rcsvfieldindexes=[("csvdate",1)],rassignments=[("date","%csvdate")]} + + in test "toplevel" $ getEffectiveAssignment rules ["a","b"] "date" @?= (Just "%csvdate") + + ,let rules = defrules{rcsvfieldindexes=[("csvdate",1)], rconditionalblocks=[CB [FieldMatcher "%csvdate" "a"] [("date","%csvdate")]]} + in test "conditional" $ getEffectiveAssignment rules ["a","b"] "date" @?= (Just "%csvdate") + + ] + ] - ] + + ] diff --git a/hledger-lib/hledger_csv.m4.md b/hledger-lib/hledger_csv.m4.md index 614b28a22..418e33680 100644 --- a/hledger-lib/hledger_csv.m4.md +++ b/hledger-lib/hledger_csv.m4.md @@ -495,13 +495,13 @@ See also: [File Extension](#file-extension). ## `if` ```rules -if PATTERN +if MATCHER RULE if -PATTERN -PATTERN -PATTERN +MATCHER +MATCHER +MATCHER RULE RULE ``` @@ -510,22 +510,29 @@ Conditional blocks ("if blocks") are a block of rules that are applied only to CSV records which match certain patterns. They are often used for customising account names based on transaction descriptions. 
-A single pattern can be written on the same line as the "if";
-or multiple patterns can be written on the following lines, non-indented.
-Multiple patterns are OR'd (any one of them can match).
-Patterns are case-insensitive regular expressions
-which try to match anywhere within the whole CSV record
-(POSIX extended regular expressions with some additions, see https://hledger.org/hledger.html#regular-expressions).
-Note the CSV record they see is close to, but not identical to, the one in the CSV file;
-enclosing double quotes will be removed, and the separator character is always comma.
-
-It's not yet easy to match within a specific field.
-If the data does not contain commas, you can hack it with a regular expression like:
+Each MATCHER can be a record matcher, which looks like this:
 ```rules
-# match "foo" in the fourth field
-if ^([^,]*,){3}foo
+REGEX
 ```
+REGEX is a case-insensitive regular expression which tries to match anywhere within the CSV record.
+It is a POSIX extended regular expression with some additions (see
+[Regular expressions](https://hledger.org/hledger.html#regular-expressions) in the hledger manual).
+Note: the "CSV record" it is matched against is not the original record, but a synthetic one,
+with enclosing double quotes or whitespace removed, and always comma-separated.
+(Eg, an SSV record `2020-01-01; "Acme, Inc."; 1,000` appears to REGEX as `2020-01-01,Acme, Inc.,1,000`).
+
+Or, MATCHER can be a field matcher, like this:
+```rules
+%CSVFIELD REGEX
+```
+which matches just the content of a particular CSV field.
+CSVFIELD is a percent sign followed by the field's name or column number, like `%date` or `%1`.
+
+A single matcher can be written on the same line as the "if";
+or multiple matchers can be written on the following lines, non-indented.
+Multiple matchers are OR'd (any one of them can match).
+
 After the patterns there should be one or more rules to apply, all indented by at least one space.
 
 Three kinds of rule are allowed in conditional blocks:
diff --git a/tests/csv.test b/tests/csv.test
index 8a9b6defa..f76515cca 100644
--- a/tests/csv.test
+++ b/tests/csv.test
@@ -560,8 +560,26 @@
 $ ./hledger-csv
 >=0
 
+# 27. match a specific field
+<
+2020-01-01, 1
+2020-01-01, 2
+RULES
+fields date, desc
+if %desc 1
+ description one
 
-## 26. A single unbalanced posting with number other than 1 also should not generate a balancing posting.
+$ ./hledger-csv desc:one
+2020-01-01 one
+
+>=0
+
+## .
+#<
+#$ ./hledger-csv
+#>=0
+
+## . A single unbalanced posting with number other than 1 also should not generate a balancing posting.
 #<
 #2019-01-01,1
 #
@@ -575,7 +593,7 @@ $ ./hledger-csv
 #
 #>=0
 #
-## 27. A single posting that's zero also should not generate a balancing posting.
+## . A single posting that's zero also should not generate a balancing posting.
 #<
 #2019-01-01,0
 #
@@ -589,7 +607,7 @@ $ ./hledger-csv
 #
 #>=0
 
-## 28. With a bracketed account name, the auto-generated second posting should also be bracketed.
+## . With a bracketed account name, the auto-generated second posting should also be bracketed.
 #<
 #2019-01-01,1
 #
@@ -604,8 +622,6 @@
 #
 #>=0
 
-
-
 # . TODO: without --separator gives obscure error
 # |
 # 1 | 10/2009/09;Flubber Co🎅;50;
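To illustrate the feature added above, here is a small rules-file sketch combining a whole-record matcher with the new field matcher. It is not part of the patch; the column names, patterns, and account names are invented for the example, while the `fields`, `if`, `account1`, and `account2` rules are the standard CSV rules shown in the manual changes and tests above:

```rules
# name the CSV columns, so they can be referenced as %date, %description, %amount
fields date, description, amount

# a record matcher: the regexp is matched against the whole (synthetic, comma-separated) record
if opening balance
 account1 assets:bank:checking

# a field matcher (new in this change): the regexp is matched against the referenced field's value only
if %description chez jacques
 account2 expenses:dining
```

Each `if` block whose matcher matches a CSV record applies its indented rules to that record; listing several matchers in one block ORs them, as described in the manual text above.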