From 577e4b6347a0930e523347ad760e2dbfa7835546 Mon Sep 17 00:00:00 2001
From: Simon Michael <simon@joyful.com>
Date: Thu, 11 May 2023 16:37:37 -1000
Subject: [PATCH] fix!: csv: skip now counts non-blank lines more robustly (fix
 #2024)

Inner empty lines were not being skipped automatically, contrary to
docs. Now all empty lines are skipped automatically, and the `skip`
rule is needed only for non-empty lines, as intended.

This may be a breaking change: it's possible that the `skip` count
might need to be adjusted in some CSV rules files.
---
 hledger-lib/Hledger/Read/CsvReader.hs | 133 ++++++++++++++------------
 hledger/hledger.m4.md                 |   9 +-
 hledger/test/csv.test                 |  23 ++++-
 3 files changed, 98 insertions(+), 67 deletions(-)

diff --git a/hledger-lib/Hledger/Read/CsvReader.hs b/hledger-lib/Hledger/Read/CsvReader.hs
index 7f86c578d..bb5f113ca 100644
--- a/hledger-lib/Hledger/Read/CsvReader.hs
+++ b/hledger-lib/Hledger/Read/CsvReader.hs
@@ -86,6 +86,13 @@ type CSV       = [CsvRecord]
 type CsvRecord = [CsvValue]
 type CsvValue  = Text
 
+-- ** utils
+
+printCSV :: [CsvRecord] -> TL.Text
+printCSV = TB.toLazyText . unlinesB . map printRecord
+    where printRecord = foldMap TB.fromText . intersperse "," . map printField
+          printField = wrap "\"" "\"" . T.replace "\"" "\"\""
+
 --- ** reader
 
 reader :: MonadIO m => Reader m
@@ -687,8 +694,10 @@ regexp end = do
 --
 readJournalFromCsv :: Maybe FilePath -> FilePath -> Text -> ExceptT String IO Journal
 readJournalFromCsv Nothing "-" _ = throwError "please use --rules-file when reading CSV from stdin"
-readJournalFromCsv mrulesfile csvfile csvdata = do
-    -- parse the csv rules
+readJournalFromCsv mrulesfile csvfile csvtext = do
+    -- for now, correctness is the priority here, efficiency not so much
+
+    -- get the csv rules as text
     let rulesfile = fromMaybe (rulesFileFor csvfile) mrulesfile
     rulesfileexists <- liftIO $ doesFileExist rulesfile
     rulestext <- liftIO $ if rulesfileexists
@@ -697,27 +706,24 @@ readJournalFromCsv mrulesfile csvfile csvdata = do
         readFilePortably rulesfile >>= expandIncludes (takeDirectory rulesfile)
       else
         return $ defaultRulesText rulesfile
+
+    -- parse the csv rules
     rules <- liftEither $ parseAndValidateCsvRules rulesfile rulestext
     dbg6IO "csv rules" rules
 
-    mtzin <- case getDirective "timezone" rules of
-              Nothing -> return Nothing
-              Just s  ->
-                maybe (throwError $ "could not parse time zone: " ++ T.unpack s) (return.Just) $
-                parseTimeM False defaultTimeLocale "%Z" $ T.unpack s
-    tzout <- liftIO getCurrentTimeZone
+    -- convert the csv data to lines and remove all empty/blank lines
+    let csvlines1 = dbg9 "csvlines1" $ filter (not . T.null . T.strip) $ dbg9 "csvlines0" $ T.lines csvtext
 
-    -- skip header lines, if there is a top-level skip rule
+    -- if there is a top-level skip rule, skip the specified number of non-empty lines
     skiplines <- case getDirective "skip" rules of
                       Nothing -> return 0
                       Just "" -> return 1
                       Just s  -> maybe (throwError $ "could not parse skip value: " ++ show s) return . readMay $ T.unpack s
-    let csvdata' = T.unlines $ drop skiplines $ T.lines csvdata
+    let csvlines2 = dbg9 "csvlines2" $ drop skiplines csvlines1
 
-    -- parse csv
+    -- convert back to text and parse as csv records
     let
-      -- parsec seems to fail if you pass it "-" here TODO: try again with megaparsec
-      parsecfilename = if csvfile == "-" then "(stdin)" else csvfile
+      csvtext1 = T.unlines csvlines2
       separator =
         case getDirective "separator" rules >>= parseSeparator of
           Just c           -> c
@@ -726,15 +732,27 @@ readJournalFromCsv mrulesfile csvfile csvdata = do
           _                -> ','
           where
             ext = map toLower $ drop 1 $ takeExtension csvfile
+      -- parsec seemed to fail if you pass it "-" here   -- TODO: try again with megaparsec
+      parsecfilename = if csvfile == "-" then "(stdin)" else csvfile
     dbg6IO "using separator" separator
-    csv <- dbg7 "parseCsv" <$> parseCsv separator parsecfilename csvdata'
-    records <- liftEither $ dbg7 "validateCsv" <$> validateCsv rules csv
-    dbg6IO "first 3 csv records" $ take 3 records
+    -- parse csv records
+    csvrecords0 <- dbg7 "parseCsv" <$> parseCsv separator parsecfilename csvtext1
+    -- remove any records skipped by conditional skip or end rules
+    let csvrecords1 = applyConditionalSkips rules csvrecords0
+    -- and check the remaining records for any obvious problems
+    csvrecords <- liftEither $ dbg7 "validateCsv" <$> validateCsv csvrecords1
+    dbg6IO "first 3 csv records" $ take 3 csvrecords
 
-    -- identify header lines
-    -- let (headerlines, datalines) = identifyHeaderLines records
+    -- XXX identify header lines some day ?
+    -- let (headerlines, datalines) = identifyHeaderLines csvrecords'
     --     mfieldnames = lastMay headerlines
 
+    tzout <- liftIO getCurrentTimeZone
+    mtzin <- case getDirective "timezone" rules of
+              Nothing -> return Nothing
+              Just s  ->
+                maybe (throwError $ "could not parse time zone: " ++ T.unpack s) (return.Just) $
+                parseTimeM False defaultTimeLocale "%Z" $ T.unpack s
     let
       -- convert CSV records to transactions, saving the CSV line numbers for error positions
       txns = dbg7 "csv txns" $ snd $ mapAccumL
@@ -746,7 +764,7 @@ readJournalFromCsv mrulesfile csvfile csvdata = do
                         in
                           (pos', transactionFromCsvRecord timesarezoned mtzin tzout pos rules r)
                      )
-                     (initialPos parsecfilename) records
+                     (initialPos parsecfilename) csvrecords
         where
           timesarezoned =
             case csvRule rules "date-format" of
@@ -793,62 +811,55 @@ parseSeparator = specials . T.toLower
 
 -- Call parseCassava on a file or stdin, converting the result to ExceptT.
 parseCsv :: Char -> FilePath -> Text -> ExceptT String IO [CsvRecord]
-parseCsv separator filePath csvdata = ExceptT $
+parseCsv separator filePath csvtext = ExceptT $
   case filePath of
     "-" -> parseCassava separator "(stdin)" <$> T.getContents
-    _   -> return $ if T.null csvdata then Right mempty else parseCassava separator filePath csvdata
+    _   -> return $ if T.null csvtext then Right mempty else parseCassava separator filePath csvtext
 
 -- Parse text into CSV records, using Cassava and the given field separator.
 parseCassava :: Char -> FilePath -> Text -> Either String [CsvRecord]
 parseCassava separator path content =
+  -- XXX we now remove all blank lines before parsing; will Cassava will still produce [""] records ?
+  -- filter (/=[""])
   either (Left . errorBundlePretty) (Right . parseResultToCsv) <$>
-  CassavaMegaparsec.decodeWith (decodeOptions separator) Cassava.NoHeader path $
+  CassavaMegaparsec.decodeWith decodeOptions Cassava.NoHeader path $
   BL.fromStrict $ T.encodeUtf8 content
-
-decodeOptions :: Char -> Cassava.DecodeOptions
-decodeOptions separator = Cassava.defaultDecodeOptions {
+  where
+    decodeOptions = Cassava.defaultDecodeOptions {
                       Cassava.decDelimiter = fromIntegral (ord separator)
                     }
-
-parseResultToCsv :: (Foldable t, Functor t) => t (t B.ByteString) -> [CsvRecord]
-parseResultToCsv = toListList . unpackFields
-    where
+    parseResultToCsv :: (Foldable t, Functor t) => t (t B.ByteString) -> [CsvRecord]
+    parseResultToCsv = toListList . unpackFields
+      where
         toListList = toList . fmap toList
         unpackFields  = (fmap . fmap) T.decodeUtf8
 
-printCSV :: [CsvRecord] -> TL.Text
-printCSV = TB.toLazyText . unlinesB . map printRecord
-    where printRecord = foldMap TB.fromText . intersperse "," . map printField
-          printField = wrap "\"" "\"" . T.replace "\"" "\"\""
-
--- | Do some cleanup and validation on the parsed CSV records.
--- Cleanups: filter out empty ([""]) records
--- 
--- * 
---
--- Return the cleaned up and validated CSV data (can be empty), or an error.
-validateCsv :: CsvRules -> [CsvRecord] -> Either String [CsvRecord]
-validateCsv rules = validate . applyConditionalSkips . filternulls
+-- | Scan for csv records where a conditional `skip` or `end` rule applies,
+-- and apply that rule, removing one or more following records.
+applyConditionalSkips :: CsvRules -> [CsvRecord] -> [CsvRecord]
+applyConditionalSkips _ [] = []
+applyConditionalSkips rules (r:rest) =
+  case skipnum r of
+    Nothing -> r : applyConditionalSkips rules rest
+    Just cnt -> applyConditionalSkips rules $ drop (cnt-1) rest
   where
-    filternulls = filter (/=[""])
-    applyConditionalSkips [] = []
-    applyConditionalSkips (r:rest) =
-      case skipnum r of
-        Nothing -> r:(applyConditionalSkips rest)
-        Just cnt -> applyConditionalSkips (drop (cnt-1) rest)
-      where
-        skipnum r1 =
-          case (getEffectiveAssignment rules r1 "end", getEffectiveAssignment rules r1 "skip") of
-            (Nothing, Nothing) -> Nothing
-            (Just _, _) -> Just maxBound
-            (Nothing, Just "") -> Just 1
-            (Nothing, Just x) -> Just (read $ T.unpack x)
-    validate [] = Right []
-    validate rs@(_first:_) = case lessthan2 of
-        Just r  -> Left $ printf "CSV record %s has less than two fields" (show r)
-        Nothing -> Right rs
-      where
-        lessthan2 = headMay $ filter ((<2).length) rs
+    skipnum r1 =
+      case (getEffectiveAssignment rules r1 "end", getEffectiveAssignment rules r1 "skip") of
+        (Nothing, Nothing) -> Nothing
+        (Just _, _) -> Just maxBound
+        (Nothing, Just "") -> Just 1
+        (Nothing, Just x) -> Just (read $ T.unpack x)
+
+-- | Do some validation on the parsed CSV records:
+-- check that they all have at least two fields.
+validateCsv :: [CsvRecord] -> Either String [CsvRecord]
+validateCsv [] = Right []
+validateCsv rs@(_first:_) =
+  case lessthan2 of
+    Just r  -> Left $ printf "CSV record %s has less than two fields" (show r)
+    Nothing -> Right rs
+  where
+    lessthan2 = headMay $ filter ((<2).length) rs
 
 -- -- | The highest (0-based) field index referenced in the field
 -- -- definitions, or -1 if no fields are defined.
diff --git a/hledger/hledger.m4.md b/hledger/hledger.m4.md
index 139dbe35d..6cdd23912 100644
--- a/hledger/hledger.m4.md
+++ b/hledger/hledger.m4.md
@@ -2964,13 +2964,12 @@ skip N
 ```
 The word `skip` followed by a number (or no number, meaning 1)
 tells hledger to ignore this many non-empty lines at the start of the input data.
-(Empty/blank lines are skipped automatically, so you don't need to count those.)
 You'll need this whenever your CSV data contains header lines.
-Header lines skipped in this way are ignored, and not parsed as CSV.
+Note, empty and blank lines are skipped automatically, so you don't need to count those.
 
-`skip` can also be used inside [if blocks](#if-block) (described below),
-to skip individual data records.
-Note records skipped in this way are still required to be [valid CSV](#valid-csv), even though otherwise ignored.
+`skip` has a second meaning: it can be used inside [if blocks](#if-block) (described below),
+to skip one or more records whenever the condition is true.
+Records skipped in this way are ignored, except they are still required to be [valid CSV](#valid-csv).
 
 ## `date-format`
 
diff --git a/hledger/test/csv.test b/hledger/test/csv.test
index 2c134bb5e..feb9fcf78 100644
--- a/hledger/test/csv.test
+++ b/hledger/test/csv.test
@@ -1059,7 +1059,28 @@ fields date, b, c
 $  ./csvtest.sh
 >=
 
-# 54. Some validation is done on account name assignments; trying to
+# 54. Empty (zero length) or blank (containing only spaces, tabs, etc.) lines
+# are skipped automatically, including inner ones; skip's argument
+# counts only the non-empty/blank lines.
+<
+
+ 	
+title
+
+	 
+date, amount
+2023-01-01, 1
+RULES
+skip 2
+fields date, amount
+$  ./csvtest.sh
+2023-01-01
+    expenses:unknown               1
+    income:unknown                -1
+
+>=
+
+# 55. Some validation is done on account name assignments; trying to
 # also set an amount there (with 2+ spaces) will be rejected. (#1978)
 <
 2022-01-01,1