fix!: csv: skip now counts non-blank lines more robustly (fix #2024)

Inner empty lines were not being skipped automatically, contrary to
docs. Now all empty lines are skipped automatically, and the `skip`
rule is needed only for non-empty lines, as intended.

This may be a breaking change: the `skip` count might need to be
adjusted in some CSV rules files.
This commit is contained in:
Simon Michael 2023-05-11 16:37:37 -10:00
parent 69be1d4ef7
commit 577e4b6347
3 changed files with 98 additions and 67 deletions

View File

@ -86,6 +86,13 @@ type CSV = [CsvRecord]
type CsvRecord = [CsvValue] type CsvRecord = [CsvValue]
type CsvValue = Text type CsvValue = Text
-- ** utils
-- | Render CSV records as lazy text: each field is wrapped in double
-- quotes (embedded double quotes are doubled), fields are joined with
-- commas, and records are newline-terminated.
printCSV :: [CsvRecord] -> TL.Text
printCSV records = TB.toLazyText $ unlinesB $ map renderRecord records
  where
    renderRecord fields = foldMap TB.fromText $ intersperse "," $ map renderField fields
    renderField field   = wrap "\"" "\"" $ T.replace "\"" "\"\"" field
--- ** reader --- ** reader
reader :: MonadIO m => Reader m reader :: MonadIO m => Reader m
@ -687,8 +694,10 @@ regexp end = do
-- --
readJournalFromCsv :: Maybe FilePath -> FilePath -> Text -> ExceptT String IO Journal readJournalFromCsv :: Maybe FilePath -> FilePath -> Text -> ExceptT String IO Journal
readJournalFromCsv Nothing "-" _ = throwError "please use --rules-file when reading CSV from stdin" readJournalFromCsv Nothing "-" _ = throwError "please use --rules-file when reading CSV from stdin"
readJournalFromCsv mrulesfile csvfile csvdata = do readJournalFromCsv mrulesfile csvfile csvtext = do
-- parse the csv rules -- for now, correctness is the priority here, efficiency not so much
-- get the csv rules as text
let rulesfile = fromMaybe (rulesFileFor csvfile) mrulesfile let rulesfile = fromMaybe (rulesFileFor csvfile) mrulesfile
rulesfileexists <- liftIO $ doesFileExist rulesfile rulesfileexists <- liftIO $ doesFileExist rulesfile
rulestext <- liftIO $ if rulesfileexists rulestext <- liftIO $ if rulesfileexists
@ -697,27 +706,24 @@ readJournalFromCsv mrulesfile csvfile csvdata = do
readFilePortably rulesfile >>= expandIncludes (takeDirectory rulesfile) readFilePortably rulesfile >>= expandIncludes (takeDirectory rulesfile)
else else
return $ defaultRulesText rulesfile return $ defaultRulesText rulesfile
-- parse the csv rules
rules <- liftEither $ parseAndValidateCsvRules rulesfile rulestext rules <- liftEither $ parseAndValidateCsvRules rulesfile rulestext
dbg6IO "csv rules" rules dbg6IO "csv rules" rules
mtzin <- case getDirective "timezone" rules of -- convert the csv data to lines and remove all empty/blank lines
Nothing -> return Nothing let csvlines1 = dbg9 "csvlines1" $ filter (not . T.null . T.strip) $ dbg9 "csvlines0" $ T.lines csvtext
Just s ->
maybe (throwError $ "could not parse time zone: " ++ T.unpack s) (return.Just) $
parseTimeM False defaultTimeLocale "%Z" $ T.unpack s
tzout <- liftIO getCurrentTimeZone
-- skip header lines, if there is a top-level skip rule -- if there is a top-level skip rule, skip the specified number of non-empty lines
skiplines <- case getDirective "skip" rules of skiplines <- case getDirective "skip" rules of
Nothing -> return 0 Nothing -> return 0
Just "" -> return 1 Just "" -> return 1
Just s -> maybe (throwError $ "could not parse skip value: " ++ show s) return . readMay $ T.unpack s Just s -> maybe (throwError $ "could not parse skip value: " ++ show s) return . readMay $ T.unpack s
let csvdata' = T.unlines $ drop skiplines $ T.lines csvdata let csvlines2 = dbg9 "csvlines2" $ drop skiplines csvlines1
-- parse csv -- convert back to text and parse as csv records
let let
-- parsec seems to fail if you pass it "-" here TODO: try again with megaparsec csvtext1 = T.unlines csvlines2
parsecfilename = if csvfile == "-" then "(stdin)" else csvfile
separator = separator =
case getDirective "separator" rules >>= parseSeparator of case getDirective "separator" rules >>= parseSeparator of
Just c -> c Just c -> c
@ -726,15 +732,27 @@ readJournalFromCsv mrulesfile csvfile csvdata = do
_ -> ',' _ -> ','
where where
ext = map toLower $ drop 1 $ takeExtension csvfile ext = map toLower $ drop 1 $ takeExtension csvfile
-- parsec seemed to fail if you pass it "-" here -- TODO: try again with megaparsec
parsecfilename = if csvfile == "-" then "(stdin)" else csvfile
dbg6IO "using separator" separator dbg6IO "using separator" separator
csv <- dbg7 "parseCsv" <$> parseCsv separator parsecfilename csvdata' -- parse csv records
records <- liftEither $ dbg7 "validateCsv" <$> validateCsv rules csv csvrecords0 <- dbg7 "parseCsv" <$> parseCsv separator parsecfilename csvtext1
dbg6IO "first 3 csv records" $ take 3 records -- remove any records skipped by conditional skip or end rules
let csvrecords1 = applyConditionalSkips rules csvrecords0
-- and check the remaining records for any obvious problems
csvrecords <- liftEither $ dbg7 "validateCsv" <$> validateCsv csvrecords1
dbg6IO "first 3 csv records" $ take 3 csvrecords
-- identify header lines -- XXX identify header lines some day ?
-- let (headerlines, datalines) = identifyHeaderLines records -- let (headerlines, datalines) = identifyHeaderLines csvrecords'
-- mfieldnames = lastMay headerlines -- mfieldnames = lastMay headerlines
tzout <- liftIO getCurrentTimeZone
mtzin <- case getDirective "timezone" rules of
Nothing -> return Nothing
Just s ->
maybe (throwError $ "could not parse time zone: " ++ T.unpack s) (return.Just) $
parseTimeM False defaultTimeLocale "%Z" $ T.unpack s
let let
-- convert CSV records to transactions, saving the CSV line numbers for error positions -- convert CSV records to transactions, saving the CSV line numbers for error positions
txns = dbg7 "csv txns" $ snd $ mapAccumL txns = dbg7 "csv txns" $ snd $ mapAccumL
@ -746,7 +764,7 @@ readJournalFromCsv mrulesfile csvfile csvdata = do
in in
(pos', transactionFromCsvRecord timesarezoned mtzin tzout pos rules r) (pos', transactionFromCsvRecord timesarezoned mtzin tzout pos rules r)
) )
(initialPos parsecfilename) records (initialPos parsecfilename) csvrecords
where where
timesarezoned = timesarezoned =
case csvRule rules "date-format" of case csvRule rules "date-format" of
@ -793,62 +811,55 @@ parseSeparator = specials . T.toLower
-- Call parseCassava on a file or stdin, converting the result to ExceptT. -- Call parseCassava on a file or stdin, converting the result to ExceptT.
parseCsv :: Char -> FilePath -> Text -> ExceptT String IO [CsvRecord] parseCsv :: Char -> FilePath -> Text -> ExceptT String IO [CsvRecord]
parseCsv separator filePath csvdata = ExceptT $ parseCsv separator filePath csvtext = ExceptT $
case filePath of case filePath of
"-" -> parseCassava separator "(stdin)" <$> T.getContents "-" -> parseCassava separator "(stdin)" <$> T.getContents
_ -> return $ if T.null csvdata then Right mempty else parseCassava separator filePath csvdata _ -> return $ if T.null csvtext then Right mempty else parseCassava separator filePath csvtext
-- Parse text into CSV records, using Cassava and the given field separator. -- Parse text into CSV records, using Cassava and the given field separator.
parseCassava :: Char -> FilePath -> Text -> Either String [CsvRecord] parseCassava :: Char -> FilePath -> Text -> Either String [CsvRecord]
parseCassava separator path content = parseCassava separator path content =
-- XXX we now remove all blank lines before parsing; will Cassava will still produce [""] records ?
-- filter (/=[""])
either (Left . errorBundlePretty) (Right . parseResultToCsv) <$> either (Left . errorBundlePretty) (Right . parseResultToCsv) <$>
CassavaMegaparsec.decodeWith (decodeOptions separator) Cassava.NoHeader path $ CassavaMegaparsec.decodeWith decodeOptions Cassava.NoHeader path $
BL.fromStrict $ T.encodeUtf8 content BL.fromStrict $ T.encodeUtf8 content
where
decodeOptions :: Char -> Cassava.DecodeOptions decodeOptions = Cassava.defaultDecodeOptions {
decodeOptions separator = Cassava.defaultDecodeOptions {
Cassava.decDelimiter = fromIntegral (ord separator) Cassava.decDelimiter = fromIntegral (ord separator)
} }
parseResultToCsv :: (Foldable t, Functor t) => t (t B.ByteString) -> [CsvRecord]
parseResultToCsv :: (Foldable t, Functor t) => t (t B.ByteString) -> [CsvRecord] parseResultToCsv = toListList . unpackFields
parseResultToCsv = toListList . unpackFields where
where
toListList = toList . fmap toList toListList = toList . fmap toList
unpackFields = (fmap . fmap) T.decodeUtf8 unpackFields = (fmap . fmap) T.decodeUtf8
printCSV :: [CsvRecord] -> TL.Text -- | Scan for csv records where a conditional `skip` or `end` rule applies,
printCSV = TB.toLazyText . unlinesB . map printRecord -- and apply that rule, removing one or more following records.
where printRecord = foldMap TB.fromText . intersperse "," . map printField applyConditionalSkips :: CsvRules -> [CsvRecord] -> [CsvRecord]
printField = wrap "\"" "\"" . T.replace "\"" "\"\"" applyConditionalSkips _ [] = []
applyConditionalSkips rules (r:rest) =
-- | Do some cleanup and validation on the parsed CSV records. case skipnum r of
-- Cleanups: filter out empty ([""]) records Nothing -> r : applyConditionalSkips rules rest
-- Just cnt -> applyConditionalSkips rules $ drop (cnt-1) rest
-- *
--
-- Return the cleaned up and validated CSV data (can be empty), or an error.
validateCsv :: CsvRules -> [CsvRecord] -> Either String [CsvRecord]
validateCsv rules = validate . applyConditionalSkips . filternulls
where where
filternulls = filter (/=[""]) skipnum r1 =
applyConditionalSkips [] = [] case (getEffectiveAssignment rules r1 "end", getEffectiveAssignment rules r1 "skip") of
applyConditionalSkips (r:rest) = (Nothing, Nothing) -> Nothing
case skipnum r of (Just _, _) -> Just maxBound
Nothing -> r:(applyConditionalSkips rest) (Nothing, Just "") -> Just 1
Just cnt -> applyConditionalSkips (drop (cnt-1) rest) (Nothing, Just x) -> Just (read $ T.unpack x)
where
skipnum r1 = -- | Do some validation on the parsed CSV records:
case (getEffectiveAssignment rules r1 "end", getEffectiveAssignment rules r1 "skip") of -- check that they all have at least two fields.
(Nothing, Nothing) -> Nothing validateCsv :: [CsvRecord] -> Either String [CsvRecord]
(Just _, _) -> Just maxBound validateCsv [] = Right []
(Nothing, Just "") -> Just 1 validateCsv rs@(_first:_) =
(Nothing, Just x) -> Just (read $ T.unpack x) case lessthan2 of
validate [] = Right [] Just r -> Left $ printf "CSV record %s has less than two fields" (show r)
validate rs@(_first:_) = case lessthan2 of Nothing -> Right rs
Just r -> Left $ printf "CSV record %s has less than two fields" (show r) where
Nothing -> Right rs lessthan2 = headMay $ filter ((<2).length) rs
where
lessthan2 = headMay $ filter ((<2).length) rs
-- -- | The highest (0-based) field index referenced in the field -- -- | The highest (0-based) field index referenced in the field
-- -- definitions, or -1 if no fields are defined. -- -- definitions, or -1 if no fields are defined.

View File

@ -2964,13 +2964,12 @@ skip N
``` ```
The word `skip` followed by a number (or no number, meaning 1) The word `skip` followed by a number (or no number, meaning 1)
tells hledger to ignore this many non-empty lines at the start of the input data. tells hledger to ignore this many non-empty lines at the start of the input data.
(Empty/blank lines are skipped automatically, so you don't need to count those.)
You'll need this whenever your CSV data contains header lines. You'll need this whenever your CSV data contains header lines.
Header lines skipped in this way are ignored, and not parsed as CSV. Note, empty and blank lines are skipped automatically, so you don't need to count those.
`skip` can also be used inside [if blocks](#if-block) (described below), `skip` has a second meaning: it can be used inside [if blocks](#if-block) (described below),
to skip individual data records. to skip one or more records whenever the condition is true.
Note records skipped in this way are still required to be [valid CSV](#valid-csv), even though otherwise ignored. Records skipped in this way are ignored, except they are still required to be [valid CSV](#valid-csv).
## `date-format` ## `date-format`

View File

@ -1059,7 +1059,28 @@ fields date, b, c
$ ./csvtest.sh $ ./csvtest.sh
>= >=
# 54. Some validation is done on account name assignments; trying to # 54. Empty (zero length) or blank (containing only spaces, tabs, etc.) lines
# are skipped automatically, including inner ones; skip's argument
# counts only the non-empty/blank lines.
<
title
date, amount
2023-01-01, 1
RULES
skip 2
fields date, amount
$ ./csvtest.sh
2023-01-01
expenses:unknown 1
income:unknown -1
>=
# 55. Some validation is done on account name assignments; trying to
# also set an amount there (with 2+ spaces) will be rejected. (#1978) # also set an amount there (with 2+ spaces) will be rejected. (#1978)
< <
2022-01-01,1 2022-01-01,1