fix!: csv: skip now counts non-blank lines more robustly (fix #2024)

Inner empty lines were not being skipped automatically, contrary to
docs. Now all empty lines are skipped automatically, and the `skip`
rule is needed only for non-empty lines, as intended.

This may be a breaking change: the `skip` count might need to be
adjusted in some CSV rules files.
This commit is contained in:
Simon Michael 2023-05-11 16:37:37 -10:00
parent 69be1d4ef7
commit 577e4b6347
3 changed files with 98 additions and 67 deletions

View File

@ -86,6 +86,13 @@ type CSV = [CsvRecord]
type CsvRecord = [CsvValue] type CsvRecord = [CsvValue]
type CsvValue = Text type CsvValue = Text
-- ** utils
-- | Render CSV records as lazy text: each field is wrapped in double
-- quotes (embedded double quotes are doubled), fields are joined with
-- commas, and records are newline-terminated.
printCSV :: [CsvRecord] -> TL.Text
printCSV records = TB.toLazyText $ unlinesB $ map renderRecord records
  where
    renderRecord fields = foldMap TB.fromText $ intersperse "," $ map renderField fields
    renderField field   = wrap "\"" "\"" $ T.replace "\"" "\"\"" field
--- ** reader --- ** reader
reader :: MonadIO m => Reader m reader :: MonadIO m => Reader m
@ -687,8 +694,10 @@ regexp end = do
-- --
readJournalFromCsv :: Maybe FilePath -> FilePath -> Text -> ExceptT String IO Journal readJournalFromCsv :: Maybe FilePath -> FilePath -> Text -> ExceptT String IO Journal
readJournalFromCsv Nothing "-" _ = throwError "please use --rules-file when reading CSV from stdin" readJournalFromCsv Nothing "-" _ = throwError "please use --rules-file when reading CSV from stdin"
readJournalFromCsv mrulesfile csvfile csvdata = do readJournalFromCsv mrulesfile csvfile csvtext = do
-- parse the csv rules -- for now, correctness is the priority here, efficiency not so much
-- get the csv rules as text
let rulesfile = fromMaybe (rulesFileFor csvfile) mrulesfile let rulesfile = fromMaybe (rulesFileFor csvfile) mrulesfile
rulesfileexists <- liftIO $ doesFileExist rulesfile rulesfileexists <- liftIO $ doesFileExist rulesfile
rulestext <- liftIO $ if rulesfileexists rulestext <- liftIO $ if rulesfileexists
@ -697,27 +706,24 @@ readJournalFromCsv mrulesfile csvfile csvdata = do
readFilePortably rulesfile >>= expandIncludes (takeDirectory rulesfile) readFilePortably rulesfile >>= expandIncludes (takeDirectory rulesfile)
else else
return $ defaultRulesText rulesfile return $ defaultRulesText rulesfile
-- parse the csv rules
rules <- liftEither $ parseAndValidateCsvRules rulesfile rulestext rules <- liftEither $ parseAndValidateCsvRules rulesfile rulestext
dbg6IO "csv rules" rules dbg6IO "csv rules" rules
mtzin <- case getDirective "timezone" rules of -- convert the csv data to lines and remove all empty/blank lines
Nothing -> return Nothing let csvlines1 = dbg9 "csvlines1" $ filter (not . T.null . T.strip) $ dbg9 "csvlines0" $ T.lines csvtext
Just s ->
maybe (throwError $ "could not parse time zone: " ++ T.unpack s) (return.Just) $
parseTimeM False defaultTimeLocale "%Z" $ T.unpack s
tzout <- liftIO getCurrentTimeZone
-- skip header lines, if there is a top-level skip rule -- if there is a top-level skip rule, skip the specified number of non-empty lines
skiplines <- case getDirective "skip" rules of skiplines <- case getDirective "skip" rules of
Nothing -> return 0 Nothing -> return 0
Just "" -> return 1 Just "" -> return 1
Just s -> maybe (throwError $ "could not parse skip value: " ++ show s) return . readMay $ T.unpack s Just s -> maybe (throwError $ "could not parse skip value: " ++ show s) return . readMay $ T.unpack s
let csvdata' = T.unlines $ drop skiplines $ T.lines csvdata let csvlines2 = dbg9 "csvlines2" $ drop skiplines csvlines1
-- parse csv -- convert back to text and parse as csv records
let let
-- parsec seems to fail if you pass it "-" here TODO: try again with megaparsec csvtext1 = T.unlines csvlines2
parsecfilename = if csvfile == "-" then "(stdin)" else csvfile
separator = separator =
case getDirective "separator" rules >>= parseSeparator of case getDirective "separator" rules >>= parseSeparator of
Just c -> c Just c -> c
@ -726,15 +732,27 @@ readJournalFromCsv mrulesfile csvfile csvdata = do
_ -> ',' _ -> ','
where where
ext = map toLower $ drop 1 $ takeExtension csvfile ext = map toLower $ drop 1 $ takeExtension csvfile
-- parsec seemed to fail if you pass it "-" here -- TODO: try again with megaparsec
parsecfilename = if csvfile == "-" then "(stdin)" else csvfile
dbg6IO "using separator" separator dbg6IO "using separator" separator
csv <- dbg7 "parseCsv" <$> parseCsv separator parsecfilename csvdata' -- parse csv records
records <- liftEither $ dbg7 "validateCsv" <$> validateCsv rules csv csvrecords0 <- dbg7 "parseCsv" <$> parseCsv separator parsecfilename csvtext1
dbg6IO "first 3 csv records" $ take 3 records -- remove any records skipped by conditional skip or end rules
let csvrecords1 = applyConditionalSkips rules csvrecords0
-- and check the remaining records for any obvious problems
csvrecords <- liftEither $ dbg7 "validateCsv" <$> validateCsv csvrecords1
dbg6IO "first 3 csv records" $ take 3 csvrecords
-- identify header lines -- XXX identify header lines some day ?
-- let (headerlines, datalines) = identifyHeaderLines records -- let (headerlines, datalines) = identifyHeaderLines csvrecords'
-- mfieldnames = lastMay headerlines -- mfieldnames = lastMay headerlines
tzout <- liftIO getCurrentTimeZone
mtzin <- case getDirective "timezone" rules of
Nothing -> return Nothing
Just s ->
maybe (throwError $ "could not parse time zone: " ++ T.unpack s) (return.Just) $
parseTimeM False defaultTimeLocale "%Z" $ T.unpack s
let let
-- convert CSV records to transactions, saving the CSV line numbers for error positions -- convert CSV records to transactions, saving the CSV line numbers for error positions
txns = dbg7 "csv txns" $ snd $ mapAccumL txns = dbg7 "csv txns" $ snd $ mapAccumL
@ -746,7 +764,7 @@ readJournalFromCsv mrulesfile csvfile csvdata = do
in in
(pos', transactionFromCsvRecord timesarezoned mtzin tzout pos rules r) (pos', transactionFromCsvRecord timesarezoned mtzin tzout pos rules r)
) )
(initialPos parsecfilename) records (initialPos parsecfilename) csvrecords
where where
timesarezoned = timesarezoned =
case csvRule rules "date-format" of case csvRule rules "date-format" of
@ -793,62 +811,55 @@ parseSeparator = specials . T.toLower
-- Call parseCassava on a file or stdin, converting the result to ExceptT. -- Call parseCassava on a file or stdin, converting the result to ExceptT.
parseCsv :: Char -> FilePath -> Text -> ExceptT String IO [CsvRecord] parseCsv :: Char -> FilePath -> Text -> ExceptT String IO [CsvRecord]
parseCsv separator filePath csvdata = ExceptT $ parseCsv separator filePath csvtext = ExceptT $
case filePath of case filePath of
"-" -> parseCassava separator "(stdin)" <$> T.getContents "-" -> parseCassava separator "(stdin)" <$> T.getContents
_ -> return $ if T.null csvdata then Right mempty else parseCassava separator filePath csvdata _ -> return $ if T.null csvtext then Right mempty else parseCassava separator filePath csvtext
-- Parse text into CSV records, using Cassava and the given field separator. -- Parse text into CSV records, using Cassava and the given field separator.
parseCassava :: Char -> FilePath -> Text -> Either String [CsvRecord] parseCassava :: Char -> FilePath -> Text -> Either String [CsvRecord]
parseCassava separator path content = parseCassava separator path content =
-- XXX we now remove all blank lines before parsing; will Cassava will still produce [""] records ?
-- filter (/=[""])
either (Left . errorBundlePretty) (Right . parseResultToCsv) <$> either (Left . errorBundlePretty) (Right . parseResultToCsv) <$>
CassavaMegaparsec.decodeWith (decodeOptions separator) Cassava.NoHeader path $ CassavaMegaparsec.decodeWith decodeOptions Cassava.NoHeader path $
BL.fromStrict $ T.encodeUtf8 content BL.fromStrict $ T.encodeUtf8 content
where
decodeOptions :: Char -> Cassava.DecodeOptions decodeOptions = Cassava.defaultDecodeOptions {
decodeOptions separator = Cassava.defaultDecodeOptions {
Cassava.decDelimiter = fromIntegral (ord separator) Cassava.decDelimiter = fromIntegral (ord separator)
} }
parseResultToCsv :: (Foldable t, Functor t) => t (t B.ByteString) -> [CsvRecord]
parseResultToCsv :: (Foldable t, Functor t) => t (t B.ByteString) -> [CsvRecord] parseResultToCsv = toListList . unpackFields
parseResultToCsv = toListList . unpackFields where
where
toListList = toList . fmap toList toListList = toList . fmap toList
unpackFields = (fmap . fmap) T.decodeUtf8 unpackFields = (fmap . fmap) T.decodeUtf8
printCSV :: [CsvRecord] -> TL.Text -- | Scan for csv records where a conditional `skip` or `end` rule applies,
printCSV = TB.toLazyText . unlinesB . map printRecord -- and apply that rule, removing one or more following records.
where printRecord = foldMap TB.fromText . intersperse "," . map printField applyConditionalSkips :: CsvRules -> [CsvRecord] -> [CsvRecord]
printField = wrap "\"" "\"" . T.replace "\"" "\"\"" applyConditionalSkips _ [] = []
applyConditionalSkips rules (r:rest) =
-- | Do some cleanup and validation on the parsed CSV records. case skipnum r of
-- Cleanups: filter out empty ([""]) records Nothing -> r : applyConditionalSkips rules rest
-- Just cnt -> applyConditionalSkips rules $ drop (cnt-1) rest
-- *
--
-- Return the cleaned up and validated CSV data (can be empty), or an error.
validateCsv :: CsvRules -> [CsvRecord] -> Either String [CsvRecord]
validateCsv rules = validate . applyConditionalSkips . filternulls
where where
filternulls = filter (/=[""]) skipnum r1 =
applyConditionalSkips [] = [] case (getEffectiveAssignment rules r1 "end", getEffectiveAssignment rules r1 "skip") of
applyConditionalSkips (r:rest) = (Nothing, Nothing) -> Nothing
case skipnum r of (Just _, _) -> Just maxBound
Nothing -> r:(applyConditionalSkips rest) (Nothing, Just "") -> Just 1
Just cnt -> applyConditionalSkips (drop (cnt-1) rest) (Nothing, Just x) -> Just (read $ T.unpack x)
where
skipnum r1 = -- | Do some validation on the parsed CSV records:
case (getEffectiveAssignment rules r1 "end", getEffectiveAssignment rules r1 "skip") of -- check that they all have at least two fields.
(Nothing, Nothing) -> Nothing validateCsv :: [CsvRecord] -> Either String [CsvRecord]
(Just _, _) -> Just maxBound validateCsv [] = Right []
(Nothing, Just "") -> Just 1 validateCsv rs@(_first:_) =
(Nothing, Just x) -> Just (read $ T.unpack x) case lessthan2 of
validate [] = Right [] Just r -> Left $ printf "CSV record %s has less than two fields" (show r)
validate rs@(_first:_) = case lessthan2 of Nothing -> Right rs
Just r -> Left $ printf "CSV record %s has less than two fields" (show r) where
Nothing -> Right rs lessthan2 = headMay $ filter ((<2).length) rs
where
lessthan2 = headMay $ filter ((<2).length) rs
-- -- | The highest (0-based) field index referenced in the field -- -- | The highest (0-based) field index referenced in the field
-- -- definitions, or -1 if no fields are defined. -- -- definitions, or -1 if no fields are defined.

View File

@ -2964,13 +2964,12 @@ skip N
``` ```
The word `skip` followed by a number (or no number, meaning 1) The word `skip` followed by a number (or no number, meaning 1)
tells hledger to ignore this many non-empty lines at the start of the input data. tells hledger to ignore this many non-empty lines at the start of the input data.
(Empty/blank lines are skipped automatically, so you don't need to count those.)
You'll need this whenever your CSV data contains header lines. You'll need this whenever your CSV data contains header lines.
Header lines skipped in this way are ignored, and not parsed as CSV. Note, empty and blank lines are skipped automatically, so you don't need to count those.
`skip` can also be used inside [if blocks](#if-block) (described below), `skip` has a second meaning: it can be used inside [if blocks](#if-block) (described below),
to skip individual data records. to skip one or more records whenever the condition is true.
Note records skipped in this way are still required to be [valid CSV](#valid-csv), even though otherwise ignored. Records skipped in this way are ignored, except they are still required to be [valid CSV](#valid-csv).
## `date-format` ## `date-format`

View File

@ -1059,7 +1059,28 @@ fields date, b, c
$ ./csvtest.sh $ ./csvtest.sh
>= >=
# 54. Some validation is done on account name assignments; trying to # 54. Empty (zero length) or blank (containing only spaces, tabs, etc.) lines
# are skipped automatically, including inner ones; skip's argument
# counts only the non-empty/blank lines.
<
title
date, amount
2023-01-01, 1
RULES
skip 2
fields date, amount
$ ./csvtest.sh
2023-01-01
expenses:unknown 1
income:unknown -1
>=
# 55. Some validation is done on account name assignments; trying to
# also set an amount there (with 2+ spaces) will be rejected. (#1978) # also set an amount there (with 2+ spaces) will be rejected. (#1978)
< <
2022-01-01,1 2022-01-01,1