fix!: csv: skip now counts non-blank lines more robustly (fix #2024)

Inner empty lines were not being skipped automatically, contrary to
docs. Now all empty lines are skipped automatically, and the `skip`
rule is needed only for non-empty lines, as intended.

This may be a breaking change: the `skip` count might need to be
adjusted in some CSV rules files.
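
For illustration, here is a hypothetical before/after adjustment (file
names and data invented; `skip` and `fields` are real CSV rules directives):

```
# statement.csv (hypothetical input):
#   Account Statement
#   <blank line>
#   Date, Amount
#   2023-01-01, 1

# statement.csv.rules (hypothetical):
# before this change, the blank line counted toward skip:
#   skip 3
# after it, blank lines are skipped automatically:
skip 2
fields date, amount
```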
Simon Michael 2023-05-11 16:37:37 -10:00
parent 69be1d4ef7
commit 577e4b6347
3 changed files with 98 additions and 67 deletions


@@ -86,6 +86,13 @@ type CSV = [CsvRecord]
 type CsvRecord = [CsvValue]
 type CsvValue = Text
 
+-- ** utils
+
+printCSV :: [CsvRecord] -> TL.Text
+printCSV = TB.toLazyText . unlinesB . map printRecord
+  where printRecord = foldMap TB.fromText . intersperse "," . map printField
+        printField  = wrap "\"" "\"" . T.replace "\"" "\"\""
+
 --- ** reader
 
 reader :: MonadIO m => Reader m
@@ -687,8 +694,10 @@ regexp end = do
 --
 readJournalFromCsv :: Maybe FilePath -> FilePath -> Text -> ExceptT String IO Journal
 readJournalFromCsv Nothing "-" _ = throwError "please use --rules-file when reading CSV from stdin"
-readJournalFromCsv mrulesfile csvfile csvdata = do
-  -- parse the csv rules
+readJournalFromCsv mrulesfile csvfile csvtext = do
+  -- for now, correctness is the priority here, efficiency not so much
+  -- get the csv rules as text
   let rulesfile = fromMaybe (rulesFileFor csvfile) mrulesfile
   rulesfileexists <- liftIO $ doesFileExist rulesfile
   rulestext <- liftIO $ if rulesfileexists
@@ -697,27 +706,24 @@ readJournalFromCsv mrulesfile csvfile csvdata = do
         readFilePortably rulesfile >>= expandIncludes (takeDirectory rulesfile)
       else
         return $ defaultRulesText rulesfile
+  -- parse the csv rules
   rules <- liftEither $ parseAndValidateCsvRules rulesfile rulestext
   dbg6IO "csv rules" rules
-  mtzin <- case getDirective "timezone" rules of
-    Nothing -> return Nothing
-    Just s  ->
-      maybe (throwError $ "could not parse time zone: " ++ T.unpack s) (return.Just) $
-      parseTimeM False defaultTimeLocale "%Z" $ T.unpack s
-  tzout <- liftIO getCurrentTimeZone
-  -- skip header lines, if there is a top-level skip rule
+  -- convert the csv data to lines and remove all empty/blank lines
+  let csvlines1 = dbg9 "csvlines1" $ filter (not . T.null . T.strip) $ dbg9 "csvlines0" $ T.lines csvtext
+  -- if there is a top-level skip rule, skip the specified number of non-empty lines
   skiplines <- case getDirective "skip" rules of
     Nothing -> return 0
     Just "" -> return 1
     Just s  -> maybe (throwError $ "could not parse skip value: " ++ show s) return . readMay $ T.unpack s
-  let csvdata' = T.unlines $ drop skiplines $ T.lines csvdata
-  -- parse csv
+  let csvlines2 = dbg9 "csvlines2" $ drop skiplines csvlines1
+  -- convert back to text and parse as csv records
   let
-    -- parsec seems to fail if you pass it "-" here TODO: try again with megaparsec
-    parsecfilename = if csvfile == "-" then "(stdin)" else csvfile
+    csvtext1 = T.unlines csvlines2
     separator =
       case getDirective "separator" rules >>= parseSeparator of
        Just c -> c
@@ -726,15 +732,27 @@ readJournalFromCsv mrulesfile csvfile csvdata = do
        _ -> ','
      where
       ext = map toLower $ drop 1 $ takeExtension csvfile
+    -- parsec seemed to fail if you pass it "-" here -- TODO: try again with megaparsec
+    parsecfilename = if csvfile == "-" then "(stdin)" else csvfile
   dbg6IO "using separator" separator
-  csv <- dbg7 "parseCsv" <$> parseCsv separator parsecfilename csvdata'
-  records <- liftEither $ dbg7 "validateCsv" <$> validateCsv rules csv
-  dbg6IO "first 3 csv records" $ take 3 records
-  -- identify header lines
-  -- let (headerlines, datalines) = identifyHeaderLines records
+  -- parse csv records
+  csvrecords0 <- dbg7 "parseCsv" <$> parseCsv separator parsecfilename csvtext1
+  -- remove any records skipped by conditional skip or end rules
+  let csvrecords1 = applyConditionalSkips rules csvrecords0
+  -- and check the remaining records for any obvious problems
+  csvrecords <- liftEither $ dbg7 "validateCsv" <$> validateCsv csvrecords1
+  dbg6IO "first 3 csv records" $ take 3 csvrecords
+  -- XXX identify header lines some day ?
+  -- let (headerlines, datalines) = identifyHeaderLines csvrecords'
   --     mfieldnames = lastMay headerlines
+  tzout <- liftIO getCurrentTimeZone
+  mtzin <- case getDirective "timezone" rules of
+    Nothing -> return Nothing
+    Just s  ->
+      maybe (throwError $ "could not parse time zone: " ++ T.unpack s) (return.Just) $
+      parseTimeM False defaultTimeLocale "%Z" $ T.unpack s
   let
     -- convert CSV records to transactions, saving the CSV line numbers for error positions
     txns = dbg7 "csv txns" $ snd $ mapAccumL
@@ -746,7 +764,7 @@ readJournalFromCsv mrulesfile csvfile csvdata = do
           in
             (pos', transactionFromCsvRecord timesarezoned mtzin tzout pos rules r)
           )
-          (initialPos parsecfilename) records
+          (initialPos parsecfilename) csvrecords
       where
         timesarezoned =
           case csvRule rules "date-format" of
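
The new top-level skip handling above (the csvlines0/csvlines1/csvlines2
pipeline) amounts to the following self-contained sketch (an editor's
illustration, not code from this commit; `skipCsvLines` is an invented name):

```haskell
import qualified Data.Text as T

-- Drop all empty/blank lines, then drop the first n remaining lines,
-- as readJournalFromCsv now does before CSV parsing.
skipCsvLines :: Int -> T.Text -> [T.Text]
skipCsvLines n = drop n . filter (not . T.null . T.strip) . T.lines

-- >>> skipCsvLines 2 "title\n\ndate, amount\n\n2023-01-01, 1\n"
-- ["2023-01-01, 1"]
```

Previously the count was applied to raw lines
(`drop skiplines $ T.lines csvdata`), so blank lines consumed part of it.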
@@ -793,62 +811,55 @@ parseSeparator = specials . T.toLower
 
 -- Call parseCassava on a file or stdin, converting the result to ExceptT.
 parseCsv :: Char -> FilePath -> Text -> ExceptT String IO [CsvRecord]
-parseCsv separator filePath csvdata = ExceptT $
+parseCsv separator filePath csvtext = ExceptT $
   case filePath of
     "-" -> parseCassava separator "(stdin)" <$> T.getContents
-    _   -> return $ if T.null csvdata then Right mempty else parseCassava separator filePath csvdata
+    _   -> return $ if T.null csvtext then Right mempty else parseCassava separator filePath csvtext
 
 -- Parse text into CSV records, using Cassava and the given field separator.
 parseCassava :: Char -> FilePath -> Text -> Either String [CsvRecord]
 parseCassava separator path content =
+  -- XXX we now remove all blank lines before parsing; will Cassava still produce [""] records ?
+  -- filter (/=[""])
   either (Left . errorBundlePretty) (Right . parseResultToCsv) <$>
-  CassavaMegaparsec.decodeWith (decodeOptions separator) Cassava.NoHeader path $
+  CassavaMegaparsec.decodeWith decodeOptions Cassava.NoHeader path $
   BL.fromStrict $ T.encodeUtf8 content
-
-decodeOptions :: Char -> Cassava.DecodeOptions
-decodeOptions separator = Cassava.defaultDecodeOptions {
+  where
+    decodeOptions = Cassava.defaultDecodeOptions {
      Cassava.decDelimiter = fromIntegral (ord separator)
      }
 
-parseResultToCsv :: (Foldable t, Functor t) => t (t B.ByteString) -> [CsvRecord]
-parseResultToCsv = toListList . unpackFields
-  where
+    parseResultToCsv :: (Foldable t, Functor t) => t (t B.ByteString) -> [CsvRecord]
+    parseResultToCsv = toListList . unpackFields
+      where
        toListList = toList . fmap toList
        unpackFields = (fmap . fmap) T.decodeUtf8
 
-printCSV :: [CsvRecord] -> TL.Text
-printCSV = TB.toLazyText . unlinesB . map printRecord
-  where printRecord = foldMap TB.fromText . intersperse "," . map printField
-        printField  = wrap "\"" "\"" . T.replace "\"" "\"\""
-
--- | Do some cleanup and validation on the parsed CSV records.
--- Cleanups: filter out empty ([""]) records
----
--- Return the cleaned up and validated CSV data (can be empty), or an error.
-validateCsv :: CsvRules -> [CsvRecord] -> Either String [CsvRecord]
-validateCsv rules = validate . applyConditionalSkips . filternulls
+-- | Scan for csv records where a conditional `skip` or `end` rule applies,
+-- and apply that rule, removing one or more following records.
+applyConditionalSkips :: CsvRules -> [CsvRecord] -> [CsvRecord]
+applyConditionalSkips _ [] = []
+applyConditionalSkips rules (r:rest) =
+  case skipnum r of
+    Nothing  -> r : applyConditionalSkips rules rest
+    Just cnt -> applyConditionalSkips rules $ drop (cnt-1) rest
   where
-    filternulls = filter (/=[""])
-    applyConditionalSkips [] = []
-    applyConditionalSkips (r:rest) =
-      case skipnum r of
-        Nothing  -> r:(applyConditionalSkips rest)
-        Just cnt -> applyConditionalSkips (drop (cnt-1) rest)
-      where
-        skipnum r1 =
-          case (getEffectiveAssignment rules r1 "end", getEffectiveAssignment rules r1 "skip") of
-            (Nothing, Nothing) -> Nothing
-            (Just _, _)        -> Just maxBound
-            (Nothing, Just "") -> Just 1
-            (Nothing, Just x)  -> Just (read $ T.unpack x)
-    validate [] = Right []
-    validate rs@(_first:_) = case lessthan2 of
-      Just r  -> Left $ printf "CSV record %s has less than two fields" (show r)
-      Nothing -> Right rs
-      where
-        lessthan2 = headMay $ filter ((<2).length) rs
+    skipnum r1 =
+      case (getEffectiveAssignment rules r1 "end", getEffectiveAssignment rules r1 "skip") of
+        (Nothing, Nothing) -> Nothing
+        (Just _, _)        -> Just maxBound
+        (Nothing, Just "") -> Just 1
+        (Nothing, Just x)  -> Just (read $ T.unpack x)
+
+-- | Do some validation on the parsed CSV records:
+-- check that they all have at least two fields.
+validateCsv :: [CsvRecord] -> Either String [CsvRecord]
+validateCsv [] = Right []
+validateCsv rs@(_first:_) =
+  case lessthan2 of
+    Just r  -> Left $ printf "CSV record %s has less than two fields" (show r)
+    Nothing -> Right rs
+  where
+    lessthan2 = headMay $ filter ((<2).length) rs
 
 -- -- | The highest (0-based) field index referenced in the field
 -- -- definitions, or -1 if no fields are defined.
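
The conditional-skip interpretation above can be restated as a small generic
sketch (invented names, same logic as `applyConditionalSkips`/`skipnum`):
`skip` with no number skips just the matching record, `skip N` skips it plus
the N-1 following records, and `end` (represented as `Just maxBound`) skips
everything from the match onward.

```haskell
-- Apply a per-record skip count: the matching record itself
-- plus (count-1) following records are dropped.
applySkips :: (record -> Maybe Int) -> [record] -> [record]
applySkips _ [] = []
applySkips skipnum (r:rest) =
  case skipnum r of
    Nothing  -> r : applySkips skipnum rest
    Just cnt -> applySkips skipnum (drop (cnt-1) rest)

-- >>> applySkips (\r -> if r == "x" then Just 2 else Nothing) ["a","x","b","c"]
-- ["a","c"]
```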


@@ -2964,13 +2964,12 @@ skip N
 ```
 
 The word `skip` followed by a number (or no number, meaning 1)
 tells hledger to ignore this many non-empty lines at the start of the input data.
+(Empty/blank lines are skipped automatically, so you don't need to count those.)
 You'll need this whenever your CSV data contains header lines.
-Note, empty and blank lines are skipped automatically, so you don't need to count those.
+Header lines skipped in this way are ignored, and not parsed as CSV.
 
-`skip` can also be used inside [if blocks](#if-block) (described below),
-to skip individual data records.
-Note records skipped in this way are still required to be [valid CSV](#valid-csv), even though otherwise ignored.
+`skip` has a second meaning: it can be used inside [if blocks](#if-block) (described below),
+to skip one or more records whenever the condition is true.
+Records skipped in this way are ignored, except they are still required to be [valid CSV](#valid-csv).
 
 ## `date-format`
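
To illustrate the if-block use of `skip` and `end` described in the skip
section above (a hypothetical rules snippet, not part of this commit):

```
# skip one matching record:
if ^DISCARD
 skip

# skip the matching record and the next two:
if ^DISCARD
 skip 3

# ignore everything from the totals footer onward:
if ^Total
 end
```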


@@ -1059,7 +1059,28 @@ fields date, b, c
 $ ./csvtest.sh
 >=
 
-# 54. Some validation is done on account name assignments; trying to
+# 54. Empty (zero length) or blank (containing only spaces, tabs, etc.) lines
+# are skipped automatically, including inner ones; skip's argument
+# counts only the non-empty/blank lines.
+<
+title
+
+date, amount
+
+2023-01-01, 1
+
+RULES
+skip 2
+fields date, amount
+
+$ ./csvtest.sh
+2023-01-01
+    expenses:unknown               1
+    income:unknown                -1
+
+>=
+
+# 55. Some validation is done on account name assignments; trying to
 # also set an amount there (with 2+ spaces) will be rejected. (#1978)
 <
 2022-01-01,1