csv: allow single field matching; more docs and tests

Simon Michael 2020-02-25 17:54:16 -08:00
parent becd891dd1
commit d537f1fe07
3 changed files with 199 additions and 75 deletions


@ -3,6 +3,10 @@
A reader for CSV data, using an extra rules file to help interpret the data.
-}
-- Lots of haddocks in this file are for non-exported types.
-- Here's a command that will render them:
-- stack haddock hledger-lib --fast --no-haddock-deps --haddock-arguments='--ignore-all-exports' --open
{-# LANGUAGE CPP #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE ScopedTypeVariables #-}
@ -12,6 +16,8 @@ A reader for CSV data, using an extra rules file to help interpret the data.
{-# LANGUAGE TypeSynonymInstances #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE PackageImports #-}
{-# LANGUAGE MultiWayIf #-}
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE RecordWildCards #-}
module Hledger.Read.CsvReader (
@ -115,15 +121,19 @@ parseSeparator = specials . map toLower
-- | Read a Journal from the given CSV data (and filename, used for error
-- messages), or return an error. Proceed as follows:
--
-- 1. parse CSV conversion rules from the specified rules file, or from
-- the default rules file for the specified CSV file, if it exists,
-- or throw a parse error; if it doesn't exist, use built-in default rules
--
-- 2. parse the CSV data, or throw a parse error
--
-- 3. convert the CSV records to transactions using the rules
--
-- 4. if the rules file didn't exist, create it with the default rules and filename
--
-- 5. return the transactions as a Journal
--
readJournalFromCsv :: Maybe FilePath -> FilePath -> Text -> IO (Either String Journal)
readJournalFromCsv Nothing "-" _ = return $ Left "please use --rules-file when reading CSV from stdin"
readJournalFromCsv mrulesfile csvfile csvdata =
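
For orientation, here is a minimal usage sketch of this entry point, based on the type signature above; it assumes readJournalFromCsv is exported from this module, and the file names and error handling are illustrative, not from this commit:

```haskell
import qualified Data.Text.IO as T
import Hledger.Read.CsvReader (readJournalFromCsv)

-- Convert bank.csv to a Journal using bank.csv.rules, reporting any error.
main :: IO ()
main = do
  csvdata <- T.readFile "bank.csv"
  ej <- readJournalFromCsv (Just "bank.csv.rules") "bank.csv" csvdata
  either putStrLn (const $ putStrLn "CSV converted to a Journal") ej
```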
@ -379,30 +389,44 @@ convert a particular CSV data file into meaningful journal transactions.
-}
data CsvRules = CsvRules {
rdirectives :: [(DirectiveName,String)],
-- ^ top-level rules, as (keyword, value) pairs
rcsvfieldindexes :: [(CsvFieldName, CsvFieldIndex)],
-- ^ csv field names and their column number, if declared by a fields list
rassignments :: [(JournalFieldName, FieldTemplate)],
-- ^ top-level assignments to hledger fields, as (field name, value template) pairs
rconditionalblocks :: [ConditionalBlock]
-- ^ conditional blocks, which contain additional assignments/rules to apply to matched csv records
} deriving (Show, Eq)
type CsvRulesParser a = StateT CsvRules SimpleTextParser a
-- | The keyword of a CSV rule - "fields", "skip", "if", etc.
type DirectiveName = String
-- | CSV field name.
type CsvFieldName = String
-- | 1-based CSV column number.
type CsvFieldIndex = Int
-- | Percent symbol followed by a CSV field name or column number. Eg: %date, %1.
type CsvFieldReference = String
-- | One of the standard hledger field names that can be assigned to.
type JournalFieldName = String -- XXX rename to HledgerFieldName
-- | A text value to be assigned to a hledger field, possibly
-- containing csv field references to be interpolated.
type FieldTemplate = String
-- | A strptime date parsing pattern, as supported by Data.Time.Format.
type DateFormat = String
-- | A regular expression.
type RegexpPattern = String
-- | A single test for matching a CSV record, in one way or another.
data Matcher =
RecordMatcher RegexpPattern -- ^ match if this regexp matches the overall CSV record
| FieldMatcher CsvFieldReference RegexpPattern -- ^ match if this regexp matches the referenced CSV field's value
deriving (Show, Eq)
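
For illustration (not part of the change itself), here are the two kinds of matcher as Haskell values, using the constructors above with made-up patterns:

```haskell
-- Matches when the regexp matches anywhere in the comma-joined CSV record.
wholeRecordMatcher :: Matcher
wholeRecordMatcher = RecordMatcher "chez jacques"

-- Matches when the regexp matches the value of one referenced CSV field,
-- here the field named "description".
descriptionMatcher :: Matcher
descriptionMatcher = FieldMatcher "%description" "chez jacques"
```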
-- | A conditional block: a set of CSV record matchers, and a sequence
-- of rules which will be enabled only if one or more of the matchers
-- succeeds.
--
-- Three types of rule are allowed inside conditional blocks: field
-- assignments, skip, end. (A skip or end rule is stored as if it was
@ -634,6 +658,7 @@ fieldvalp = do
lift $ dbgparse 2 "trying fieldvalp" lift $ dbgparse 2 "trying fieldvalp"
anySingle `manyTill` lift eolof anySingle `manyTill` lift eolof
-- A conditional block: one or more matchers, one per line, followed by one or more indented rules.
conditionalblockp :: CsvRulesParser ConditionalBlock
conditionalblockp = do
lift $ dbgparse 3 "trying conditionalblockp"
@ -645,10 +670,14 @@ conditionalblockp = do
return $ CB{cbMatchers=ms, cbAssignments=as}
<?> "conditional block"
-- A single matcher, on one line.
matcherp :: CsvRulesParser Matcher
matcherp = try fieldmatcherp <|> recordmatcherp
-- A single whole-record matcher.
-- A pattern on the whole line, not containing any of the match operators (~).
recordmatcherp :: CsvRulesParser Matcher
recordmatcherp = do
lift $ dbgparse 2 "trying matcherp" lift $ dbgparse 2 "trying matcherp"
-- pos <- currentPos -- pos <- currentPos
_ <- optional (matchoperatorp >> lift (skipMany spacenonewline) >> optional newline) _ <- optional (matchoperatorp >> lift (skipMany spacenonewline) >> optional newline)
@ -658,8 +687,35 @@ matcherp = do
return $ RecordMatcher r
<?> "record matcher"
-- | A single matcher for a specific field. A csv field reference (like %date or %1),
-- a match operator (~), and a pattern on the rest of the line, optionally
-- space-separated. Eg:
-- %description ~ chez jacques
fieldmatcherp :: CsvRulesParser Matcher
fieldmatcherp = do
lift $ dbgparse 2 "trying fieldmatcher"
-- An optional fieldname (default: "all")
-- f <- fromMaybe "all" `fmap` (optional $ do
-- f' <- fieldnamep
-- lift (skipMany spacenonewline)
-- return f')
f <- csvfieldreferencep <* lift (skipMany spacenonewline)
-- optional operator.. just ~ (case insensitive infix regex) for now
_op <- fromMaybe "~" <$> optional matchoperatorp
lift (skipMany spacenonewline)
r <- regexp
return $ FieldMatcher f r
<?> "field matcher"
csvfieldreferencep :: CsvRulesParser CsvFieldReference
csvfieldreferencep = do
lift $ dbgparse 3 "trying csvfieldreferencep"
char '%'
f <- fieldnamep
return $ '%' : quoteIfNeeded f
-- A match operator, indicating the type of match to perform.
-- Currently just ~ meaning case insensitive infix regex match.
matchoperatorp :: CsvRulesParser String
matchoperatorp = fmap T.unpack $ choiceInState $ map string
["~" ["~"
@ -677,19 +733,6 @@ regexp = do
cs <- anySingle `manyTill` lift eolof
return $ strip $ c:cs
-- fieldmatcher = do
-- dbgparse 2 "trying fieldmatcher"
-- f <- fromMaybe "all" `fmap` (optional $ do
-- f' <- fieldname
-- lift (skipMany spacenonewline)
-- return f')
-- char '~'
-- lift (skipMany spacenonewline)
-- ps <- patterns
-- let r = "(" ++ intercalate "|" ps ++ ")"
-- return (f,r)
-- <?> "field matcher"
--------------------------------------------------------------------------------
-- Converting CSV records to journal transactions
@ -962,40 +1005,64 @@ showRecord :: CsvRecord -> String
showRecord r = "the CSV record is: "++intercalate "," (map show r)
-- | Given the conversion rules, a CSV record and a journal entry field name, find
-- the template value ultimately assigned to this field, if any,
-- by a field assignment at top level or in a conditional block matching this record.
--
-- Note conditional blocks' patterns are matched against an approximation of the
-- CSV record: all the field values, without enclosing quotes, comma-separated.
--
getEffectiveAssignment :: CsvRules -> CsvRecord -> JournalFieldName -> Maybe FieldTemplate
getEffectiveAssignment rules record f = lastMay $ map snd $ assignments
where
-- all active assignments to field f, in order
assignments = dbg2 "assignments" $ filter ((==f).fst) $ toplevelassignments ++ conditionalassignments
where
-- all top level field assignments
toplevelassignments = rassignments rules
-- all field assignments in conditional blocks assigning to field f and active for the current csv record
conditionalassignments = concatMap cbAssignments $ filter isBlockActive $ blocksAssigning f
where
-- all conditional blocks which can potentially assign field f
blocksAssigning f = filter (any ((==f).fst) . cbAssignments) $ rconditionalblocks rules
-- does this conditional block match the current csv record ?
isBlockActive :: ConditionalBlock -> Bool
isBlockActive CB{..} = any matcherMatches cbMatchers
where
-- does this individual matcher match the current csv record ?
matcherMatches :: Matcher -> Bool
matcherMatches (RecordMatcher pat) = regexMatchesCI pat wholecsvline
where
-- a synthetic whole CSV record to match against; note, it has
-- no quotes enclosing fields, and is always comma-separated,
-- so may differ from the actual record, and may not be valid CSV.
wholecsvline = dbg3 "wholecsvline" $ intercalate "," record
matcherMatches (FieldMatcher csvfieldref pat) = regexMatchesCI pat csvfieldvalue
where
-- the value of the referenced CSV field to match against.
csvfieldvalue = dbg3 "csvfieldvalue" $ replaceCsvFieldReference rules record csvfieldref
-- | Render a field assignment's template, possibly interpolating referenced
-- CSV field values. Outer whitespace is removed from interpolated values.
renderTemplate :: CsvRules -> CsvRecord -> FieldTemplate -> String
renderTemplate rules record t = regexReplaceBy "%[A-z0-9_-]+" (replaceCsvFieldReference rules record) t
where
replace ('%':pat) = maybe pat (\i -> strip $ atDef "" record (i-1)) mindex
where
mindex | all isDigit pat = readMay pat
| otherwise = lookup (map toLower pat) $ rcsvfieldindexes rules
replace pat = pat
-- | Replace something that looks like a reference to a csv field ("%date" or "%1")
-- with that field's value. If it doesn't look like a field reference, or if we
-- can't find such a field, leave it unchanged.
replaceCsvFieldReference :: CsvRules -> CsvRecord -> CsvFieldReference -> String
replaceCsvFieldReference rules record s@('%':fieldname) = fromMaybe s $ csvFieldValue rules record fieldname
replaceCsvFieldReference _ _ s = s
-- | Get the (whitespace-stripped) value of a CSV field, identified by its name or
-- column number, ("date" or "1"), from the given CSV record, if such a field exists.
csvFieldValue :: CsvRules -> CsvRecord -> CsvFieldName -> Maybe String
csvFieldValue rules record fieldname = do
fieldindex <- if | all isDigit fieldname -> readMay fieldname
| otherwise -> lookup (map toLower fieldname) $ rcsvfieldindexes rules
fieldvalue <- strip <$> atMay record (fieldindex-1)
return fieldvalue
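
A small sketch (not from the commit) of how the interpolation helpers above fit together, with a made-up rules value and CSV record:

```haskell
-- Hypothetical rules: two named columns, nothing else.
egRules :: CsvRules
egRules = CsvRules
  { rdirectives        = []
  , rcsvfieldindexes   = [("date",1),("amount",2)]
  , rassignments       = []
  , rconditionalblocks = []
  }

-- renderTemplate egRules ["2020-02-25","5 EUR"] "%date groceries %2"
--   is expected to give "2020-02-25 groceries 5 EUR"
-- csvFieldValue egRules ["2020-02-25","5 EUR"] "amount"
--   is expected to give Just "5 EUR"
```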
-- | Parse the date string using the specified date-format, or if unspecified try these default formats:
-- YYYY/MM/DD, YYYY-MM-DD, YYYY.MM.DD, MM/DD/YYYY (month and day can be 1 or 2 digits, year must be 4).
parseDateWithFormatOrDefaultFormats :: Maybe DateFormat -> String -> Maybe Day
parseDateWithFormatOrDefaultFormats mformat s = firstJust $ map parsewith formats
@ -1025,26 +1092,60 @@ parseDateWithFormatOrDefaultFormats mformat s = firstJust $ map parsewith format
tests_CsvReader = tests "CsvReader" [
tests "parseCsvRules" [
test "empty file" $
parseCsvRules "unknown" "" @?= Right defrules
]
,tests "rulesp" [
test "trailing comments" $
parseWithState' defrules rulesp "skip\n# \n#\n" @?= Right defrules{rdirectives = [("skip","")]}
,test "trailing blank lines" $
parseWithState' defrules rulesp "skip\n\n \n" @?= (Right defrules{rdirectives = [("skip","")]})
,test "no final newline" $
parseWithState' defrules rulesp "skip" @?= (Right defrules{rdirectives=[("skip","")]})
,test "assignment with empty value" $
parseWithState' defrules rulesp "account1 \nif foo\n account2 foo\n" @?=
(Right defrules{rassignments = [("account1","")], rconditionalblocks = [CB{cbMatchers=[RecordMatcher "foo"],cbAssignments=[("account2","foo")]}]})
]
,tests "conditionalblockp" [
test "space after conditional" $ -- #1120
parseWithState' defrules conditionalblockp "if a\n account2 b\n \n" @?=
(Right $ CB{cbMatchers=[RecordMatcher "a"],cbAssignments=[("account2","b")]})
,tests "csvfieldreferencep" [
test "number" $ parseWithState' defrules csvfieldreferencep "%1" @?= (Right "%1")
,test "name" $ parseWithState' defrules csvfieldreferencep "%date" @?= (Right "%date")
,test "quoted name" $ parseWithState' defrules csvfieldreferencep "%\"csv date\"" @?= (Right "%\"csv date\"")
]
,tests "matcherp" [
test "recordmatcherp" $
parseWithState' defrules matcherp "A A\n" @?= (Right $ RecordMatcher "A A")
,test "fieldmatcherp.starts-with-%" $
parseWithState' defrules matcherp "description A A\n" @?= (Right $ RecordMatcher "description A A")
,test "fieldmatcherp" $
parseWithState' defrules matcherp "%description A A\n" @?= (Right $ FieldMatcher "%description" "A A")
,test "fieldmatcherp with operator" $
parseWithState' defrules matcherp "%description ~ A A\n" @?= (Right $ FieldMatcher "%description" "A A")
]
,tests "getEffectiveAssignment" [
let rules = defrules{rcsvfieldindexes=[("csvdate",1)],rassignments=[("date","%csvdate")]}
in test "toplevel" $ getEffectiveAssignment rules ["a","b"] "date" @?= (Just "%csvdate")
,let rules = defrules{rcsvfieldindexes=[("csvdate",1)], rconditionalblocks=[CB [FieldMatcher "%csvdate" "a"] [("date","%csvdate")]]}
in test "conditional" $ getEffectiveAssignment rules ["a","b"] "date" @?= (Just "%csvdate")
]
]
]
]


@ -495,13 +495,13 @@ See also: [File Extension](#file-extension).
## `if`

```rules
if MATCHER
 RULE

if
MATCHER
MATCHER
MATCHER
 RULE
 RULE
```
@ -510,22 +510,29 @@ Conditional blocks ("if blocks") are a block of rules that are applied
only to CSV records which match certain patterns. They are often used
for customising account names based on transaction descriptions.

Each MATCHER can be a record matcher, which looks like this:

```rules
REGEX
```
REGEX is a case-insensitive regular expression which tries to match anywhere within the CSV record.
It is a POSIX extended regular expression with some additions (see
[Regular expressions](https://hledger.org/hledger.html#regular-expressions) in the hledger manual).
Note: the "CSV record" it is matched against is not the original record, but a synthetic one,
with enclosing double quotes or whitespace removed, and always comma-separated.
(Eg, an SSV record `2020-01-01; "Acme, Inc."; 1,000` appears to REGEX as `2020-01-01,Acme, Inc.,1,000`).
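
For example, this block (with an illustrative pattern and account name) applies its rule to any record containing "whole foods" anywhere:

```rules
if whole foods
 account2  expenses:groceries
```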
Or, MATCHER can be a field matcher, like this:
```rules
%CSVFIELD REGEX
```
which matches just the content of a particular CSV field.
CSVFIELD is a percent sign followed by the field's name or column number, like `%date` or `%1`.
A single matcher can be written on the same line as the "if";
or multiple matchers can be written on the following lines, non-indented.
Multiple matchers are OR'd (any one of them can match).
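
For example, this block (field and account names are illustrative) applies its rule when either matcher succeeds, testing only the description field:

```rules
if
%description coffee
%description espresso
 account2  expenses:cafe
```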
After the patterns there should be one or more rules to apply, all
indented by at least one space. Three kinds of rule are allowed in
conditional blocks:


@ -560,8 +560,26 @@ $ ./hledger-csv
>=0
# 27. match a specific field
<
2020-01-01, 1
2020-01-01, 2
RULES
fields date, desc
if %desc 1
description one
$ ./hledger-csv desc:one
2020-01-01 one
>=0
## .
#<
#$ ./hledger-csv
#>=0
## . A single unbalanced posting with number other than 1 also should not generate a balancing posting.
#<
#2019-01-01,1
#
@ -575,7 +593,7 @@ $ ./hledger-csv
#
#>=0
#
## . A single posting that's zero also should not generate a balancing posting.
#<
#2019-01-01,0
#
@ -589,7 +607,7 @@ $ ./hledger-csv
#
#>=0
## . With a bracketed account name, the auto-generated second posting should also be bracketed.
#<
#2019-01-01,1
#
@ -604,8 +622,6 @@ $ ./hledger-csv
#
#>=0
# . TODO: without --separator gives obscure error # . TODO: without --separator gives obscure error
# |
# 1 | 10/2009/09;Flubber Co🎅;50;