csv: allow single field matching; more docs and tests

Simon Michael 2020-02-25 17:54:16 -08:00
parent becd891dd1
commit d537f1fe07
3 changed files with 199 additions and 75 deletions


@ -3,6 +3,10 @@
A reader for CSV data, using an extra rules file to help interpret the data.
-}
-- Lots of haddocks in this file are for non-exported types.
-- Here's a command that will render them:
-- stack haddock hledger-lib --fast --no-haddock-deps --haddock-arguments='--ignore-all-exports' --open
{-# LANGUAGE CPP #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE ScopedTypeVariables #-}
@ -12,6 +16,8 @@ A reader for CSV data, using an extra rules file to help interpret the data.
{-# LANGUAGE TypeSynonymInstances #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE PackageImports #-}
{-# LANGUAGE MultiWayIf #-}
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE RecordWildCards #-}
module Hledger.Read.CsvReader (
@ -115,15 +121,19 @@ parseSeparator = specials . map toLower
-- | Read a Journal from the given CSV data (and filename, used for error
-- messages), or return an error. Proceed as follows:
--
-- 1. parse CSV conversion rules from the specified rules file, or from
-- the default rules file for the specified CSV file, if it exists,
-- or throw a parse error; if it doesn't exist, use built-in default rules
--
-- 2. parse the CSV data, or throw a parse error
--
-- 3. convert the CSV records to transactions using the rules
--
-- 4. if the rules file didn't exist, create it with the default rules and filename
--
-- 5. return the transactions as a Journal
--
readJournalFromCsv :: Maybe FilePath -> FilePath -> Text -> IO (Either String Journal)
readJournalFromCsv Nothing "-" _ = return $ Left "please use --rules-file when reading CSV from stdin"
readJournalFromCsv mrulesfile csvfile csvdata =
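
For orientation, here is a minimal usage sketch of this entry point, based on the type signature above; it assumes readJournalFromCsv is exported from this module, and the file names and error handling are illustrative, not from this commit:

```haskell
import qualified Data.Text.IO as T
import Hledger.Read.CsvReader (readJournalFromCsv)

-- Convert bank.csv to a Journal using bank.csv.rules, reporting any error.
main :: IO ()
main = do
  csvdata <- T.readFile "bank.csv"
  ej <- readJournalFromCsv (Just "bank.csv.rules") "bank.csv" csvdata
  either putStrLn (const $ putStrLn "CSV converted to a Journal") ej
```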
@ -379,30 +389,44 @@ convert a particular CSV data file into meaningful journal transactions.
-}
data CsvRules = CsvRules {
rdirectives :: [(DirectiveName,String)],
-- ^ top-level rules, as (keyword, value) pairs
rcsvfieldindexes :: [(CsvFieldName, CsvFieldIndex)],
-- ^ csv field names and their column number, if declared by a fields list
rassignments :: [(JournalFieldName, FieldTemplate)],
-- ^ top-level assignments to hledger fields, as (field name, value template) pairs
rconditionalblocks :: [ConditionalBlock]
-- ^ conditional blocks, which contain additional assignments/rules to apply to matched csv records
} deriving (Show, Eq)
type CsvRulesParser a = StateT CsvRules SimpleTextParser a
-- | The keyword of a CSV rule - "fields", "skip", "if", etc.
type DirectiveName = String
-- | CSV field name.
type CsvFieldName = String
-- | 1-based CSV column number.
type CsvFieldIndex = Int
-- | Percent symbol followed by a CSV field name or column number. Eg: %date, %1.
type CsvFieldReference = String
-- | One of the standard hledger field names that can be assigned to.
type JournalFieldName = String -- XXX rename to HledgerFieldName
-- | A text value to be assigned to a hledger field, possibly
-- containing csv field references to be interpolated.
type FieldTemplate = String
-- | A strptime date parsing pattern, as supported by Data.Time.Format.
type DateFormat = String
-- | A regular expression.
type RegexpPattern = String
-- | A single test for matching a CSV record, in one way or another.
data Matcher =
RecordMatcher RegexpPattern -- ^ match if this regexp matches the overall CSV record
| FieldMatcher CsvFieldReference RegexpPattern -- ^ match if this regexp matches the referenced CSV field's value
deriving (Show, Eq)
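
For illustration (not part of the change itself), here are the two kinds of matcher as Haskell values, using the constructors above with made-up patterns:

```haskell
-- Matches when the regexp matches anywhere in the comma-joined CSV record.
wholeRecordMatcher :: Matcher
wholeRecordMatcher = RecordMatcher "chez jacques"

-- Matches when the regexp matches the value of one referenced CSV field,
-- here the field named "description".
descriptionMatcher :: Matcher
descriptionMatcher = FieldMatcher "%description" "chez jacques"
```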
-- | A conditional block: a set of CSV record matchers, and a sequence
-- of rules which will be enabled only if one or more of the matchers
-- succeeds.
--
-- Three types of rule are allowed inside conditional blocks: field
-- assignments, skip, end. (A skip or end rule is stored as if it was
@ -634,6 +658,7 @@ fieldvalp = do
lift $ dbgparse 2 "trying fieldvalp" lift $ dbgparse 2 "trying fieldvalp"
anySingle `manyTill` lift eolof anySingle `manyTill` lift eolof
-- A conditional block: one or more matchers, one per line, followed by one or more indented rules.
conditionalblockp :: CsvRulesParser ConditionalBlock
conditionalblockp = do
lift $ dbgparse 3 "trying conditionalblockp"
@ -645,10 +670,14 @@ conditionalblockp = do
return $ CB{cbMatchers=ms, cbAssignments=as}
<?> "conditional block"
-- A single matcher, on one line.
matcherp :: CsvRulesParser Matcher
matcherp = try fieldmatcherp <|> recordmatcherp
-- A single whole-record matcher.
-- A pattern on the whole line, not containing any of the match operators (~).
recordmatcherp :: CsvRulesParser Matcher
recordmatcherp = do
lift $ dbgparse 2 "trying matcherp" lift $ dbgparse 2 "trying matcherp"
-- pos <- currentPos -- pos <- currentPos
_ <- optional (matchoperatorp >> lift (skipMany spacenonewline) >> optional newline) _ <- optional (matchoperatorp >> lift (skipMany spacenonewline) >> optional newline)
@ -658,8 +687,35 @@ matcherp = do
return $ RecordMatcher r
<?> "record matcher"
-- | A single matcher for a specific field. A csv field reference (like %date or %1),
-- a match operator (~), and a pattern on the rest of the line, optionally
-- space-separated. Eg:
-- %description ~ chez jacques
fieldmatcherp :: CsvRulesParser Matcher
fieldmatcherp = do
lift $ dbgparse 2 "trying fieldmatcher"
-- An optional fieldname (default: "all")
-- f <- fromMaybe "all" `fmap` (optional $ do
-- f' <- fieldnamep
-- lift (skipMany spacenonewline)
-- return f')
f <- csvfieldreferencep <* lift (skipMany spacenonewline)
-- optional operator.. just ~ (case insensitive infix regex) for now
_op <- fromMaybe "~" <$> optional matchoperatorp
lift (skipMany spacenonewline)
r <- regexp
return $ FieldMatcher f r
<?> "field matcher"
csvfieldreferencep :: CsvRulesParser CsvFieldReference
csvfieldreferencep = do
lift $ dbgparse 3 "trying csvfieldreferencep"
char '%'
f <- fieldnamep
return $ '%' : quoteIfNeeded f
-- A match operator, indicating the type of match to perform.
-- Currently just ~ meaning case insensitive infix regex match.
matchoperatorp :: CsvRulesParser String
matchoperatorp = fmap T.unpack $ choiceInState $ map string
["~" ["~"
@ -677,19 +733,6 @@ regexp = do
cs <- anySingle `manyTill` lift eolof
return $ strip $ c:cs
-- fieldmatcher = do
-- dbgparse 2 "trying fieldmatcher"
-- f <- fromMaybe "all" `fmap` (optional $ do
-- f' <- fieldname
-- lift (skipMany spacenonewline)
-- return f')
-- char '~'
-- lift (skipMany spacenonewline)
-- ps <- patterns
-- let r = "(" ++ intercalate "|" ps ++ ")"
-- return (f,r)
-- <?> "field matcher"
--------------------------------------------------------------------------------
-- Converting CSV records to journal transactions
@ -962,40 +1005,64 @@ showRecord :: CsvRecord -> String
showRecord r = "the CSV record is: "++intercalate "," (map show r)
-- | Given the conversion rules, a CSV record and a journal entry field name, find
-- the template value ultimately assigned to this field, if any,
-- by a field assignment at top level or in a conditional block matching this record.
--
-- Note conditional blocks' patterns are matched against an approximation of the
-- CSV record: all the field values, without enclosing quotes, comma-separated.
--
getEffectiveAssignment :: CsvRules -> CsvRecord -> JournalFieldName -> Maybe FieldTemplate
getEffectiveAssignment rules record f = lastMay $ map snd $ assignments
where
-- all active assignments to field f, in order
assignments = dbg2 "assignments" $ filter ((==f).fst) $ toplevelassignments ++ conditionalassignments
where
-- all top level field assignments
toplevelassignments = rassignments rules
-- all field assignments in conditional blocks assigning to field f and active for the current csv record
conditionalassignments = concatMap cbAssignments $ filter isBlockActive $ blocksAssigning f
where
-- all conditional blocks which can potentially assign field f
blocksAssigning f = filter (any ((==f).fst) . cbAssignments) $ rconditionalblocks rules
-- does this conditional block match the current csv record ?
isBlockActive :: ConditionalBlock -> Bool
isBlockActive CB{..} = any matcherMatches cbMatchers
where
-- does this individual matcher match the current csv record ?
matcherMatches :: Matcher -> Bool
matcherMatches (RecordMatcher pat) = regexMatchesCI pat wholecsvline
where
-- a synthetic whole CSV record to match against; note, it has
-- no quotes enclosing fields, and is always comma-separated,
-- so may differ from the actual record, and may not be valid CSV.
wholecsvline = dbg3 "wholecsvline" $ intercalate "," record
matcherMatches (FieldMatcher csvfieldref pat) = regexMatchesCI pat csvfieldvalue
where
-- the value of the referenced CSV field to match against.
csvfieldvalue = dbg3 "csvfieldvalue" $ replaceCsvFieldReference rules record csvfieldref
-- | Render a field assignment's template, possibly interpolating referenced
-- CSV field values. Outer whitespace is removed from interpolated values.
renderTemplate :: CsvRules -> CsvRecord -> FieldTemplate -> String
renderTemplate rules record t = regexReplaceBy "%[A-z0-9_-]+" (replaceCsvFieldReference rules record) t
where
replace ('%':pat) = maybe pat (\i -> strip $ atDef "" record (i-1)) mindex
where
mindex | all isDigit pat = readMay pat
| otherwise = lookup (map toLower pat) $ rcsvfieldindexes rules
replace pat = pat
-- | Replace something that looks like a reference to a csv field ("%date" or "%1")
-- with that field's value. If it doesn't look like a field reference, or if we
-- can't find such a field, leave it unchanged.
replaceCsvFieldReference :: CsvRules -> CsvRecord -> CsvFieldReference -> String
replaceCsvFieldReference rules record s@('%':fieldname) = fromMaybe s $ csvFieldValue rules record fieldname
replaceCsvFieldReference _ _ s = s
-- | Get the (whitespace-stripped) value of a CSV field, identified by its name or
-- column number, ("date" or "1"), from the given CSV record, if such a field exists.
csvFieldValue :: CsvRules -> CsvRecord -> CsvFieldName -> Maybe String
csvFieldValue rules record fieldname = do
fieldindex <- if | all isDigit fieldname -> readMay fieldname
| otherwise -> lookup (map toLower fieldname) $ rcsvfieldindexes rules
fieldvalue <- strip <$> atMay record (fieldindex-1)
return fieldvalue
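
A small sketch (not from the commit) of how the interpolation helpers above fit together, with a made-up rules value and CSV record:

```haskell
-- Hypothetical rules: two named columns, nothing else.
egRules :: CsvRules
egRules = CsvRules
  { rdirectives        = []
  , rcsvfieldindexes   = [("date",1),("amount",2)]
  , rassignments       = []
  , rconditionalblocks = []
  }

-- renderTemplate egRules ["2020-02-25","5 EUR"] "%date groceries %2"
--   is expected to give "2020-02-25 groceries 5 EUR"
-- csvFieldValue egRules ["2020-02-25","5 EUR"] "amount"
--   is expected to give Just "5 EUR"
```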
-- | Parse the date string using the specified date-format, or if unspecified try these default formats:
-- YYYY/MM/DD, YYYY-MM-DD, YYYY.MM.DD, MM/DD/YYYY (month and day can be 1 or 2 digits, year must be 4).
parseDateWithFormatOrDefaultFormats :: Maybe DateFormat -> String -> Maybe Day
parseDateWithFormatOrDefaultFormats mformat s = firstJust $ map parsewith formats
@ -1025,26 +1092,60 @@ parseDateWithFormatOrDefaultFormats mformat s = firstJust $ map parsewith format
tests_CsvReader = tests "CsvReader" [
tests "parseCsvRules" [
test "empty file" $
parseCsvRules "unknown" "" @?= Right defrules
]
,tests "rulesp" [
test "trailing comments" $
parseWithState' defrules rulesp "skip\n# \n#\n" @?= Right defrules{rdirectives = [("skip","")]}
,test "trailing blank lines" $
parseWithState' defrules rulesp "skip\n\n \n" @?= (Right defrules{rdirectives = [("skip","")]})
,test "no final newline" $
parseWithState' defrules rulesp "skip" @?= (Right defrules{rdirectives=[("skip","")]})
,test "assignment with empty value" $
parseWithState' defrules rulesp "account1 \nif foo\n account2 foo\n" @?=
(Right defrules{rassignments = [("account1","")], rconditionalblocks = [CB{cbMatchers=[RecordMatcher "foo"],cbAssignments=[("account2","foo")]}]})
]
,tests "conditionalblockp" [
test "space after conditional" $ -- #1120
parseWithState' defrules conditionalblockp "if a\n account2 b\n \n" @?=
(Right $ CB{cbMatchers=[RecordMatcher "a"],cbAssignments=[("account2","b")]})
,tests "csvfieldreferencep" [
test "number" $ parseWithState' defrules csvfieldreferencep "%1" @?= (Right "%1")
,test "name" $ parseWithState' defrules csvfieldreferencep "%date" @?= (Right "%date")
,test "quoted name" $ parseWithState' defrules csvfieldreferencep "%\"csv date\"" @?= (Right "%\"csv date\"")
]
,tests "matcherp" [
test "recordmatcherp" $
parseWithState' defrules matcherp "A A\n" @?= (Right $ RecordMatcher "A A")
,test "fieldmatcherp.starts-with-%" $
parseWithState' defrules matcherp "description A A\n" @?= (Right $ RecordMatcher "description A A")
,test "fieldmatcherp" $
parseWithState' defrules matcherp "%description A A\n" @?= (Right $ FieldMatcher "%description" "A A")
,test "fieldmatcherp with operator" $
parseWithState' defrules matcherp "%description ~ A A\n" @?= (Right $ FieldMatcher "%description" "A A")
]
,tests "getEffectiveAssignment" [
let rules = defrules{rcsvfieldindexes=[("csvdate",1)],rassignments=[("date","%csvdate")]}
in test "toplevel" $ getEffectiveAssignment rules ["a","b"] "date" @?= (Just "%csvdate")
,let rules = defrules{rcsvfieldindexes=[("csvdate",1)], rconditionalblocks=[CB [FieldMatcher "%csvdate" "a"] [("date","%csvdate")]]}
in test "conditional" $ getEffectiveAssignment rules ["a","b"] "date" @?= (Just "%csvdate")
]
]
]
]


@ -495,13 +495,13 @@ See also: [File Extension](#file-extension).
## `if`

```rules
if MATCHER
 RULE

if
MATCHER
MATCHER
MATCHER
 RULE
 RULE
```
@ -510,22 +510,29 @@ Conditional blocks ("if blocks") are a block of rules that are applied
only to CSV records which match certain patterns. They are often used
for customising account names based on transaction descriptions.

Each MATCHER can be a record matcher, which looks like this:

```rules
REGEX
```
REGEX is a case-insensitive regular expression which tries to match anywhere within the CSV record.
It is a POSIX extended regular expression with some additions (see
[Regular expressions](https://hledger.org/hledger.html#regular-expressions) in the hledger manual).
Note: the "CSV record" it is matched against is not the original record, but a synthetic one,
with enclosing double quotes or whitespace removed, and always comma-separated.
(Eg, an SSV record `2020-01-01; "Acme, Inc."; 1,000` appears to REGEX as `2020-01-01,Acme, Inc.,1,000`).
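
For example, this block (with an illustrative pattern and account name) applies its rule to any record containing "whole foods" anywhere:

```rules
if whole foods
 account2  expenses:groceries
```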
Or, MATCHER can be a field matcher, like this:
```rules
%CSVFIELD REGEX
```
which matches just the content of a particular CSV field.
CSVFIELD is a percent sign followed by the field's name or column number, like `%date` or `%1`.
A single matcher can be written on the same line as the "if";
or multiple matchers can be written on the following lines, non-indented.
Multiple matchers are OR'd (any one of them can match).
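
For example, this block (field and account names are illustrative) applies its rule when either matcher succeeds, testing only the description field:

```rules
if
%description coffee
%description espresso
 account2  expenses:cafe
```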
After the patterns there should be one or more rules to apply, all
indented by at least one space. Three kinds of rule are allowed in
conditional blocks:


@ -560,8 +560,26 @@ $ ./hledger-csv
>=0
# 27. match a specific field
<
2020-01-01, 1
2020-01-01, 2
RULES
fields date, desc
if %desc 1
description one
$ ./hledger-csv desc:one
2020-01-01 one
>=0
## .
#<
#$ ./hledger-csv
#>=0
## . A single unbalanced posting with number other than 1 also should not generate a balancing posting.
#<
#2019-01-01,1
#
@ -575,7 +593,7 @@ $ ./hledger-csv
#
#>=0
#
## . A single posting that's zero also should not generate a balancing posting.
#<
#2019-01-01,0
#
@ -589,7 +607,7 @@ $ ./hledger-csv
#
#>=0
## . With a bracketed account name, the auto-generated second posting should also be bracketed.
#<
#2019-01-01,1
#
@ -604,8 +622,6 @@ $ ./hledger-csv
#
#>=0
# . TODO: without --separator gives obscure error # . TODO: without --separator gives obscure error
# |
# 1 | 10/2009/09;Flubber Co🎅;50;