From 8bfa382c689cdebb26ccef32915bb993adfc84e1 Mon Sep 17 00:00:00 2001
From: Jonathan Dowland <jon@dow.land>
Date: Mon, 30 Oct 2023 11:10:35 +0000
Subject: [PATCH] feat: import: interpolate regex matches in field templates
 (#2009)

Replace occurrences of '\N' (where N is a positive number) in field
templates with the corresponding regular expression match group, if it
exists.

E.g. Warp the date to the first of the month for the second posting

    if %date (....-..)-..
        comment2 date:\1-01

E.g. Strip a prefix from an imported account name

    if %account1 liabilities:jon:(.*)
        account1 \1

Fixes #2009.

Signed-off-by: Jonathan Dowland <jon@dow.land>
---
 hledger-lib/Hledger/Read/RulesReader.hs | 127 +++++++++++++++---------
 1 file changed, 78 insertions(+), 49 deletions(-)

diff --git a/hledger-lib/Hledger/Read/RulesReader.hs b/hledger-lib/Hledger/Read/RulesReader.hs
index 09cac4a91..6dd890508 100644
--- a/hledger-lib/Hledger/Read/RulesReader.hs
+++ b/hledger-lib/Hledger/Read/RulesReader.hs
@@ -372,10 +372,12 @@ JOURNAL-PSEUDO-FIELD: amount-in | amount-out | currency
 
 ASSIGNMENT-SEPARATOR: SPACE | ( : SPACE? )
 
-FIELD-VALUE: VALUE (possibly containing CSV-FIELD-REFERENCEs)
+FIELD-VALUE: VALUE (possibly containing CSV-FIELD-REFERENCEs and REGEX-MATCHGROUP-REFERENCEs)
 
 CSV-FIELD-REFERENCE: % CSV-FIELD
 
+REGEX-MATCHGROUP-REFERENCE: \ DIGIT+
+
 CSV-FIELD: ( FIELD-NAME | FIELD-NUMBER ) (corresponding to a CSV field)
 
 FIELD-NUMBER: DIGIT+
@@ -707,7 +709,7 @@ hledgerField = getEffectiveAssignment
 -- | Look up the final value assigned to a hledger field, with csv field
 -- references interpolated.
 hledgerFieldValue :: CsvRules -> CsvRecord -> HledgerFieldName -> Maybe Text
-hledgerFieldValue rules record = fmap (renderTemplate rules record) . hledgerField rules record
+hledgerFieldValue rules record f = (fmap (renderTemplate rules record f) . hledgerField rules record) f
 
 maybeNegate :: MatcherPrefix -> Bool -> Bool
 maybeNegate Not origbool = not origbool
@@ -725,61 +727,88 @@ getEffectiveAssignment rules record f = lastMay $ map snd $ assignments
   where
     -- all active assignments to field f, in order
     assignments = dbg9 "csv assignments" $ filter ((==f).fst) $ toplevelassignments ++ conditionalassignments
-      where
-        -- all top level field assignments
-        toplevelassignments    = rassignments rules
-        -- all field assignments in conditional blocks assigning to field f and active for the current csv record
-        conditionalassignments = concatMap cbAssignments $ filter isBlockActive $ (rblocksassigning rules) f
-          where
-            -- does this conditional block match the current csv record ?
-            isBlockActive :: ConditionalBlock -> Bool
-            isBlockActive CB{..} = any (all matcherMatches) $ groupedMatchers cbMatchers
-              where
-                -- does this individual matcher match the current csv record ?
-                matcherMatches :: Matcher -> Bool
-                matcherMatches (RecordMatcher prefix pat) = maybeNegate prefix origbool
-                  where
-                    pat' = dbg7 "regex" pat
-                    -- A synthetic whole CSV record to match against. Note, this can be
-                    -- different from the original CSV data:
-                    -- - any whitespace surrounding field values is preserved
-                    -- - any quotes enclosing field values are removed
-                    -- - and the field separator is always comma
-                    -- which means that a field containing a comma will look like two fields.
-                    wholecsvline = dbg7 "wholecsvline" $ T.intercalate "," record
-                    origbool = regexMatchText pat' wholecsvline
-                matcherMatches (FieldMatcher prefix csvfieldref pat) = maybeNegate prefix origbool
-                  where
-                    -- the value of the referenced CSV field to match against.
-                    csvfieldvalue = dbg7 "csvfieldvalue" $ replaceCsvFieldReference rules record csvfieldref
-                    origbool = regexMatchText pat csvfieldvalue
+    -- all top level field assignments
+    toplevelassignments    = rassignments rules 
+    -- all field assignments in conditional blocks assigning to field f and active for the current csv record
+    conditionalassignments = concatMap cbAssignments $ filter (isBlockActive rules record) $ (rblocksassigning rules) f
 
-                -- | Group matchers into associative pairs based on prefix, e.g.:
-                --   A
-                --   & B
-                --   C
-                --   D
-                --   & E
-                --   => [[A, B], [C], [D, E]]
-                groupedMatchers :: [Matcher] -> [[Matcher]]
-                groupedMatchers [] = []
-                groupedMatchers (x:xs) = (x:ys) : groupedMatchers zs
-                  where
-                    (ys, zs) = span (\y -> matcherPrefix y == And) xs
-                    matcherPrefix :: Matcher -> MatcherPrefix
-                    matcherPrefix (RecordMatcher prefix _) = prefix
-                    matcherPrefix (FieldMatcher prefix _ _) = prefix
+-- does this conditional block match the current csv record ?
+isBlockActive :: CsvRules -> CsvRecord -> ConditionalBlock -> Bool
+isBlockActive rules record CB{..} = any (all matcherMatches) $ groupedMatchers cbMatchers
+  where
+    -- does this individual matcher match the current csv record ?
+    matcherMatches :: Matcher -> Bool
+    matcherMatches (RecordMatcher prefix pat) = maybeNegate prefix origbool
+      where
+        pat' = dbg7 "regex" pat
+        -- A synthetic whole CSV record to match against. Note, this can be
+        -- different from the original CSV data:
+        -- - any whitespace surrounding field values is preserved
+        -- - any quotes enclosing field values are removed
+        -- - and the field separator is always comma
+        -- which means that a field containing a comma will look like two fields.
+        wholecsvline = dbg7 "wholecsvline" $ T.intercalate "," record
+        origbool = regexMatchText pat' wholecsvline
+    matcherMatches (FieldMatcher prefix csvfieldref pat) = maybeNegate prefix origbool
+      where
+        -- the value of the referenced CSV field to match against.
+        csvfieldvalue = dbg7 "csvfieldvalue" $ replaceCsvFieldReference rules record csvfieldref
+        origbool = regexMatchText pat csvfieldvalue
+
+    -- | Group matchers into associative pairs based on prefix, e.g.:
+    --   A
+    --   & B
+    --   C
+    --   D
+    --   & E
+    --   => [[A, B], [C], [D, E]]
+    groupedMatchers :: [Matcher] -> [[Matcher]]
+    groupedMatchers [] = []
+    groupedMatchers (x:xs) = (x:ys) : groupedMatchers zs
+      where
+        (ys, zs) = span (\y -> matcherPrefix y == And) xs
+        matcherPrefix :: Matcher -> MatcherPrefix
+        matcherPrefix (RecordMatcher prefix _) = prefix
+        matcherPrefix (FieldMatcher prefix _ _) = prefix
 
 -- | Render a field assignment's template, possibly interpolating referenced
 -- CSV field values. Outer whitespace is removed from interpolated values.
-renderTemplate ::  CsvRules -> CsvRecord -> FieldTemplate -> Text
-renderTemplate rules record t = maybe t mconcat $ parseMaybe
-    (many $ takeWhile1P Nothing (/='%')
+renderTemplate ::  CsvRules -> CsvRecord -> HledgerFieldName -> FieldTemplate -> Text
+renderTemplate rules record f t = maybe t mconcat $ parseMaybe
+    (many $ takeWhile1P Nothing isNotEscapeChar
+        <|> replaceRegexGroupReference rules record f <$> matchp
         <|> replaceCsvFieldReference rules record <$> referencep)
     t
   where
+    -- XXX: can we return a parsed Int here?
+    matchp = liftA2 T.cons (char '\\') (takeWhile1P (Just "matchref") isDigit) :: Parsec HledgerParseErrorData Text Text
     referencep = liftA2 T.cons (char '%') (takeWhile1P (Just "reference") isFieldNameChar) :: Parsec HledgerParseErrorData Text Text
     isFieldNameChar c = isAlphaNum c || c == '_' || c == '-'
+    isNotEscapeChar c = c /='%' && c /= '\\'
+
+-- | Replace something that looks like a Regex match group reference with the
+-- resulting match group value after applying the Regex.
+replaceRegexGroupReference :: CsvRules -> CsvRecord -> HledgerFieldName -> FieldTemplate -> Text
+replaceRegexGroupReference rules record f s = case T.uncons s of
+    Just ('\\', group) -> fromMaybe "" $ regexMatchValue rules record f group
+    _                  -> s
+
+regexMatchValue :: CsvRules -> CsvRecord -> HledgerFieldName -> Text -> Maybe Text
+regexMatchValue rules record f sgroup = let
+  matchgroups  = concatMap (getMatchGroups rules record)
+               $ concatMap cbMatchers
+               $ filter (isBlockActive rules record)
+               $ rblocksassigning rules f
+  group = (read (T.unpack sgroup) :: Int) - 1 -- adjust to 0-indexing
+  in atMay matchgroups group
+
+getMatchGroups :: CsvRules -> CsvRecord -> Matcher -> [Text]
+getMatchGroups _ record (RecordMatcher _ regex)  = let
+  txt = T.intercalate "," record -- see caveats of wholecsvline, in `isBlockActive`
+  in regexMatchTextGroups regex txt
+getMatchGroups rules record (FieldMatcher _ fieldref regex) = let
+  txt = replaceCsvFieldReference rules record fieldref
+  in regexMatchTextGroups regex txt
 
 -- | Replace something that looks like a reference to a csv field ("%date" or "%1)
 -- with that field's value. If it doesn't look like a field reference, or if we
@@ -1159,7 +1188,7 @@ getAmount rules record currency p1IsVirtual n =
 
     -- assignments to any of these field names with non-empty values
     assignments = [(f,a') | f <- fieldnames
-                          , Just v <- [T.strip . renderTemplate rules record <$> hledgerField rules record f]
+                          , Just v <- [T.strip . renderTemplate rules record f <$> hledgerField rules record f]
                           , not $ T.null v
                           -- XXX maybe ignore rule-generated values like "", "-", "$", "-$", "$-" ? cf CSV FORMAT -> "amount", "Setting amounts",
                           , let a = parseAmount rules record currency v