feat: import: interpolate regex matches in field templates (#2009)

Replace occurrences of '\N' (where N is a positive number) in field
templates with the corresponding regular expression match group, if it
exists.

E.g. Warp the date to the first of the month for the second posting

    if %date (....-..)-..
        comment2 date:\1-01

E.g. Strip a prefix from an imported account name

    if %account1 liabilities:jon:(.*)
        account1 \1

Fixes #2009.

Signed-off-by: Jonathan Dowland <jon@dow.land>
This commit is contained in:
Jonathan Dowland 2023-10-30 11:10:35 +00:00 committed by Simon Michael
parent 549c47bca8
commit 8bfa382c68

View File

@ -372,10 +372,12 @@ JOURNAL-PSEUDO-FIELD: amount-in | amount-out | currency
ASSIGNMENT-SEPARATOR: SPACE | ( : SPACE? )
FIELD-VALUE: VALUE (possibly containing CSV-FIELD-REFERENCEs)
FIELD-VALUE: VALUE (possibly containing CSV-FIELD-REFERENCEs and REGEX-MATCHGROUP-REFERENCEs)
CSV-FIELD-REFERENCE: % CSV-FIELD
REGEX-MATCHGROUP-REFERENCE: \ DIGIT+
CSV-FIELD: ( FIELD-NAME | FIELD-NUMBER ) (corresponding to a CSV field)
FIELD-NUMBER: DIGIT+
@ -707,7 +709,7 @@ hledgerField = getEffectiveAssignment
-- | Look up the final value assigned to a hledger field, with csv field
-- references interpolated.
hledgerFieldValue :: CsvRules -> CsvRecord -> HledgerFieldName -> Maybe Text
hledgerFieldValue rules record = fmap (renderTemplate rules record) . hledgerField rules record
hledgerFieldValue rules record f = (fmap (renderTemplate rules record f) . hledgerField rules record) f
maybeNegate :: MatcherPrefix -> Bool -> Bool
maybeNegate Not origbool = not origbool
@ -725,61 +727,88 @@ getEffectiveAssignment rules record f = lastMay $ map snd $ assignments
where
-- all active assignments to field f, in order
assignments = dbg9 "csv assignments" $ filter ((==f).fst) $ toplevelassignments ++ conditionalassignments
where
-- all top level field assignments
toplevelassignments = rassignments rules
-- all field assignments in conditional blocks assigning to field f and active for the current csv record
conditionalassignments = concatMap cbAssignments $ filter isBlockActive $ (rblocksassigning rules) f
where
-- does this conditional block match the current csv record ?
isBlockActive :: ConditionalBlock -> Bool
isBlockActive CB{..} = any (all matcherMatches) $ groupedMatchers cbMatchers
where
-- does this individual matcher match the current csv record ?
matcherMatches :: Matcher -> Bool
matcherMatches (RecordMatcher prefix pat) = maybeNegate prefix origbool
where
pat' = dbg7 "regex" pat
-- A synthetic whole CSV record to match against. Note, this can be
-- different from the original CSV data:
-- - any whitespace surrounding field values is preserved
-- - any quotes enclosing field values are removed
-- - and the field separator is always comma
-- which means that a field containing a comma will look like two fields.
wholecsvline = dbg7 "wholecsvline" $ T.intercalate "," record
origbool = regexMatchText pat' wholecsvline
matcherMatches (FieldMatcher prefix csvfieldref pat) = maybeNegate prefix origbool
where
-- the value of the referenced CSV field to match against.
csvfieldvalue = dbg7 "csvfieldvalue" $ replaceCsvFieldReference rules record csvfieldref
origbool = regexMatchText pat csvfieldvalue
-- all top level field assignments
toplevelassignments = rassignments rules
-- all field assignments in conditional blocks assigning to field f and active for the current csv record
conditionalassignments = concatMap cbAssignments $ filter (isBlockActive rules record) $ (rblocksassigning rules) f
-- | Group matchers into associative pairs based on prefix, e.g.:
-- A
-- & B
-- C
-- D
-- & E
-- => [[A, B], [C], [D, E]]
groupedMatchers :: [Matcher] -> [[Matcher]]
groupedMatchers [] = []
groupedMatchers (x:xs) = (x:ys) : groupedMatchers zs
where
(ys, zs) = span (\y -> matcherPrefix y == And) xs
matcherPrefix :: Matcher -> MatcherPrefix
matcherPrefix (RecordMatcher prefix _) = prefix
matcherPrefix (FieldMatcher prefix _ _) = prefix
-- does this conditional block match the current csv record ?
isBlockActive :: CsvRules -> CsvRecord -> ConditionalBlock -> Bool
isBlockActive rules record CB{..} = any (all matcherMatches) $ groupedMatchers cbMatchers
where
-- does this individual matcher match the current csv record ?
matcherMatches :: Matcher -> Bool
matcherMatches (RecordMatcher prefix pat) = maybeNegate prefix origbool
where
pat' = dbg7 "regex" pat
-- A synthetic whole CSV record to match against. Note, this can be
-- different from the original CSV data:
-- - any whitespace surrounding field values is preserved
-- - any quotes enclosing field values are removed
-- - and the field separator is always comma
-- which means that a field containing a comma will look like two fields.
wholecsvline = dbg7 "wholecsvline" $ T.intercalate "," record
origbool = regexMatchText pat' wholecsvline
matcherMatches (FieldMatcher prefix csvfieldref pat) = maybeNegate prefix origbool
where
-- the value of the referenced CSV field to match against.
csvfieldvalue = dbg7 "csvfieldvalue" $ replaceCsvFieldReference rules record csvfieldref
origbool = regexMatchText pat csvfieldvalue
-- | Group matchers into associative pairs based on prefix, e.g.:
-- A
-- & B
-- C
-- D
-- & E
-- => [[A, B], [C], [D, E]]
groupedMatchers :: [Matcher] -> [[Matcher]]
groupedMatchers [] = []
groupedMatchers (x:xs) = (x:ys) : groupedMatchers zs
where
(ys, zs) = span (\y -> matcherPrefix y == And) xs
matcherPrefix :: Matcher -> MatcherPrefix
matcherPrefix (RecordMatcher prefix _) = prefix
matcherPrefix (FieldMatcher prefix _ _) = prefix
-- | Render a field assignment's template, possibly interpolating referenced
-- CSV field values. Outer whitespace is removed from interpolated values.
renderTemplate :: CsvRules -> CsvRecord -> FieldTemplate -> Text
renderTemplate rules record t = maybe t mconcat $ parseMaybe
(many $ takeWhile1P Nothing (/='%')
renderTemplate :: CsvRules -> CsvRecord -> HledgerFieldName -> FieldTemplate -> Text
renderTemplate rules record f t = maybe t mconcat $ parseMaybe
(many $ takeWhile1P Nothing isNotEscapeChar
<|> replaceRegexGroupReference rules record f <$> matchp
<|> replaceCsvFieldReference rules record <$> referencep)
t
where
-- XXX: can we return a parsed Int here?
matchp = liftA2 T.cons (char '\\') (takeWhile1P (Just "matchref") isDigit) :: Parsec HledgerParseErrorData Text Text
referencep = liftA2 T.cons (char '%') (takeWhile1P (Just "reference") isFieldNameChar) :: Parsec HledgerParseErrorData Text Text
isFieldNameChar c = isAlphaNum c || c == '_' || c == '-'
isNotEscapeChar c = c /='%' && c /= '\\'
-- | Replace something that looks like a Regex match group reference with the
-- resulting match group value after applying the Regex.
replaceRegexGroupReference :: CsvRules -> CsvRecord -> HledgerFieldName -> FieldTemplate -> Text
replaceRegexGroupReference rules record f s = case T.uncons s of
Just ('\\', group) -> fromMaybe "" $ regexMatchValue rules record f group
_ -> s
regexMatchValue :: CsvRules -> CsvRecord -> HledgerFieldName -> Text -> Maybe Text
regexMatchValue rules record f sgroup = let
matchgroups = concatMap (getMatchGroups rules record)
$ concatMap cbMatchers
$ filter (isBlockActive rules record)
$ rblocksassigning rules f
group = (read (T.unpack sgroup) :: Int) - 1 -- adjust to 0-indexing
in atMay matchgroups group
getMatchGroups :: CsvRules -> CsvRecord -> Matcher -> [Text]
getMatchGroups _ record (RecordMatcher _ regex) = let
txt = T.intercalate "," record -- see caveats of wholecsvline, in `isBlockActive`
in regexMatchTextGroups regex txt
getMatchGroups rules record (FieldMatcher _ fieldref regex) = let
txt = replaceCsvFieldReference rules record fieldref
in regexMatchTextGroups regex txt
-- | Replace something that looks like a reference to a csv field ("%date" or "%1)
-- with that field's value. If it doesn't look like a field reference, or if we
@ -1159,7 +1188,7 @@ getAmount rules record currency p1IsVirtual n =
-- assignments to any of these field names with non-empty values
assignments = [(f,a') | f <- fieldnames
, Just v <- [T.strip . renderTemplate rules record <$> hledgerField rules record f]
, Just v <- [T.strip . renderTemplate rules record f <$> hledgerField rules record f]
, not $ T.null v
-- XXX maybe ignore rule-generated values like "", "-", "$", "-$", "$-" ? cf CSV FORMAT -> "amount", "Setting amounts",
, let a = parseAmount rules record currency v