fix!: csv: skip now counts non-blank lines more robustly (fix #2024)
Inner empty lines were not being skipped automatically, contrary to docs. Now all empty lines are skipped automatically, and the `skip` rule is needed only for non-empty lines, as intended. This may be a breaking change: it's possible that the `skip` count might need to be adjusted in some CSV rules files.
This commit is contained in:
parent
69be1d4ef7
commit
577e4b6347
@ -86,6 +86,13 @@ type CSV = [CsvRecord]
|
|||||||
type CsvRecord = [CsvValue]
|
type CsvRecord = [CsvValue]
|
||||||
type CsvValue = Text
|
type CsvValue = Text
|
||||||
|
|
||||||
|
-- ** utils
|
||||||
|
|
||||||
|
printCSV :: [CsvRecord] -> TL.Text
|
||||||
|
printCSV = TB.toLazyText . unlinesB . map printRecord
|
||||||
|
where printRecord = foldMap TB.fromText . intersperse "," . map printField
|
||||||
|
printField = wrap "\"" "\"" . T.replace "\"" "\"\""
|
||||||
|
|
||||||
--- ** reader
|
--- ** reader
|
||||||
|
|
||||||
reader :: MonadIO m => Reader m
|
reader :: MonadIO m => Reader m
|
||||||
@ -687,8 +694,10 @@ regexp end = do
|
|||||||
--
|
--
|
||||||
readJournalFromCsv :: Maybe FilePath -> FilePath -> Text -> ExceptT String IO Journal
|
readJournalFromCsv :: Maybe FilePath -> FilePath -> Text -> ExceptT String IO Journal
|
||||||
readJournalFromCsv Nothing "-" _ = throwError "please use --rules-file when reading CSV from stdin"
|
readJournalFromCsv Nothing "-" _ = throwError "please use --rules-file when reading CSV from stdin"
|
||||||
readJournalFromCsv mrulesfile csvfile csvdata = do
|
readJournalFromCsv mrulesfile csvfile csvtext = do
|
||||||
-- parse the csv rules
|
-- for now, correctness is the priority here, efficiency not so much
|
||||||
|
|
||||||
|
-- get the csv rules as text
|
||||||
let rulesfile = fromMaybe (rulesFileFor csvfile) mrulesfile
|
let rulesfile = fromMaybe (rulesFileFor csvfile) mrulesfile
|
||||||
rulesfileexists <- liftIO $ doesFileExist rulesfile
|
rulesfileexists <- liftIO $ doesFileExist rulesfile
|
||||||
rulestext <- liftIO $ if rulesfileexists
|
rulestext <- liftIO $ if rulesfileexists
|
||||||
@ -697,27 +706,24 @@ readJournalFromCsv mrulesfile csvfile csvdata = do
|
|||||||
readFilePortably rulesfile >>= expandIncludes (takeDirectory rulesfile)
|
readFilePortably rulesfile >>= expandIncludes (takeDirectory rulesfile)
|
||||||
else
|
else
|
||||||
return $ defaultRulesText rulesfile
|
return $ defaultRulesText rulesfile
|
||||||
|
|
||||||
|
-- parse the csv rules
|
||||||
rules <- liftEither $ parseAndValidateCsvRules rulesfile rulestext
|
rules <- liftEither $ parseAndValidateCsvRules rulesfile rulestext
|
||||||
dbg6IO "csv rules" rules
|
dbg6IO "csv rules" rules
|
||||||
|
|
||||||
mtzin <- case getDirective "timezone" rules of
|
-- convert the csv data to lines and remove all empty/blank lines
|
||||||
Nothing -> return Nothing
|
let csvlines1 = dbg9 "csvlines1" $ filter (not . T.null . T.strip) $ dbg9 "csvlines0" $ T.lines csvtext
|
||||||
Just s ->
|
|
||||||
maybe (throwError $ "could not parse time zone: " ++ T.unpack s) (return.Just) $
|
|
||||||
parseTimeM False defaultTimeLocale "%Z" $ T.unpack s
|
|
||||||
tzout <- liftIO getCurrentTimeZone
|
|
||||||
|
|
||||||
-- skip header lines, if there is a top-level skip rule
|
-- if there is a top-level skip rule, skip the specified number of non-empty lines
|
||||||
skiplines <- case getDirective "skip" rules of
|
skiplines <- case getDirective "skip" rules of
|
||||||
Nothing -> return 0
|
Nothing -> return 0
|
||||||
Just "" -> return 1
|
Just "" -> return 1
|
||||||
Just s -> maybe (throwError $ "could not parse skip value: " ++ show s) return . readMay $ T.unpack s
|
Just s -> maybe (throwError $ "could not parse skip value: " ++ show s) return . readMay $ T.unpack s
|
||||||
let csvdata' = T.unlines $ drop skiplines $ T.lines csvdata
|
let csvlines2 = dbg9 "csvlines2" $ drop skiplines csvlines1
|
||||||
|
|
||||||
-- parse csv
|
-- convert back to text and parse as csv records
|
||||||
let
|
let
|
||||||
-- parsec seems to fail if you pass it "-" here TODO: try again with megaparsec
|
csvtext1 = T.unlines csvlines2
|
||||||
parsecfilename = if csvfile == "-" then "(stdin)" else csvfile
|
|
||||||
separator =
|
separator =
|
||||||
case getDirective "separator" rules >>= parseSeparator of
|
case getDirective "separator" rules >>= parseSeparator of
|
||||||
Just c -> c
|
Just c -> c
|
||||||
@ -726,15 +732,27 @@ readJournalFromCsv mrulesfile csvfile csvdata = do
|
|||||||
_ -> ','
|
_ -> ','
|
||||||
where
|
where
|
||||||
ext = map toLower $ drop 1 $ takeExtension csvfile
|
ext = map toLower $ drop 1 $ takeExtension csvfile
|
||||||
|
-- parsec seemed to fail if you pass it "-" here -- TODO: try again with megaparsec
|
||||||
|
parsecfilename = if csvfile == "-" then "(stdin)" else csvfile
|
||||||
dbg6IO "using separator" separator
|
dbg6IO "using separator" separator
|
||||||
csv <- dbg7 "parseCsv" <$> parseCsv separator parsecfilename csvdata'
|
-- parse csv records
|
||||||
records <- liftEither $ dbg7 "validateCsv" <$> validateCsv rules csv
|
csvrecords0 <- dbg7 "parseCsv" <$> parseCsv separator parsecfilename csvtext1
|
||||||
dbg6IO "first 3 csv records" $ take 3 records
|
-- remove any records skipped by conditional skip or end rules
|
||||||
|
let csvrecords1 = applyConditionalSkips rules csvrecords0
|
||||||
|
-- and check the remaining records for any obvious problems
|
||||||
|
csvrecords <- liftEither $ dbg7 "validateCsv" <$> validateCsv csvrecords1
|
||||||
|
dbg6IO "first 3 csv records" $ take 3 csvrecords
|
||||||
|
|
||||||
-- identify header lines
|
-- XXX identify header lines some day ?
|
||||||
-- let (headerlines, datalines) = identifyHeaderLines records
|
-- let (headerlines, datalines) = identifyHeaderLines csvrecords'
|
||||||
-- mfieldnames = lastMay headerlines
|
-- mfieldnames = lastMay headerlines
|
||||||
|
|
||||||
|
tzout <- liftIO getCurrentTimeZone
|
||||||
|
mtzin <- case getDirective "timezone" rules of
|
||||||
|
Nothing -> return Nothing
|
||||||
|
Just s ->
|
||||||
|
maybe (throwError $ "could not parse time zone: " ++ T.unpack s) (return.Just) $
|
||||||
|
parseTimeM False defaultTimeLocale "%Z" $ T.unpack s
|
||||||
let
|
let
|
||||||
-- convert CSV records to transactions, saving the CSV line numbers for error positions
|
-- convert CSV records to transactions, saving the CSV line numbers for error positions
|
||||||
txns = dbg7 "csv txns" $ snd $ mapAccumL
|
txns = dbg7 "csv txns" $ snd $ mapAccumL
|
||||||
@ -746,7 +764,7 @@ readJournalFromCsv mrulesfile csvfile csvdata = do
|
|||||||
in
|
in
|
||||||
(pos', transactionFromCsvRecord timesarezoned mtzin tzout pos rules r)
|
(pos', transactionFromCsvRecord timesarezoned mtzin tzout pos rules r)
|
||||||
)
|
)
|
||||||
(initialPos parsecfilename) records
|
(initialPos parsecfilename) csvrecords
|
||||||
where
|
where
|
||||||
timesarezoned =
|
timesarezoned =
|
||||||
case csvRule rules "date-format" of
|
case csvRule rules "date-format" of
|
||||||
@ -793,62 +811,55 @@ parseSeparator = specials . T.toLower
|
|||||||
|
|
||||||
-- Call parseCassava on a file or stdin, converting the result to ExceptT.
|
-- Call parseCassava on a file or stdin, converting the result to ExceptT.
|
||||||
parseCsv :: Char -> FilePath -> Text -> ExceptT String IO [CsvRecord]
|
parseCsv :: Char -> FilePath -> Text -> ExceptT String IO [CsvRecord]
|
||||||
parseCsv separator filePath csvdata = ExceptT $
|
parseCsv separator filePath csvtext = ExceptT $
|
||||||
case filePath of
|
case filePath of
|
||||||
"-" -> parseCassava separator "(stdin)" <$> T.getContents
|
"-" -> parseCassava separator "(stdin)" <$> T.getContents
|
||||||
_ -> return $ if T.null csvdata then Right mempty else parseCassava separator filePath csvdata
|
_ -> return $ if T.null csvtext then Right mempty else parseCassava separator filePath csvtext
|
||||||
|
|
||||||
-- Parse text into CSV records, using Cassava and the given field separator.
|
-- Parse text into CSV records, using Cassava and the given field separator.
|
||||||
parseCassava :: Char -> FilePath -> Text -> Either String [CsvRecord]
|
parseCassava :: Char -> FilePath -> Text -> Either String [CsvRecord]
|
||||||
parseCassava separator path content =
|
parseCassava separator path content =
|
||||||
|
-- XXX we now remove all blank lines before parsing; will Cassava will still produce [""] records ?
|
||||||
|
-- filter (/=[""])
|
||||||
either (Left . errorBundlePretty) (Right . parseResultToCsv) <$>
|
either (Left . errorBundlePretty) (Right . parseResultToCsv) <$>
|
||||||
CassavaMegaparsec.decodeWith (decodeOptions separator) Cassava.NoHeader path $
|
CassavaMegaparsec.decodeWith decodeOptions Cassava.NoHeader path $
|
||||||
BL.fromStrict $ T.encodeUtf8 content
|
BL.fromStrict $ T.encodeUtf8 content
|
||||||
|
where
|
||||||
decodeOptions :: Char -> Cassava.DecodeOptions
|
decodeOptions = Cassava.defaultDecodeOptions {
|
||||||
decodeOptions separator = Cassava.defaultDecodeOptions {
|
|
||||||
Cassava.decDelimiter = fromIntegral (ord separator)
|
Cassava.decDelimiter = fromIntegral (ord separator)
|
||||||
}
|
}
|
||||||
|
parseResultToCsv :: (Foldable t, Functor t) => t (t B.ByteString) -> [CsvRecord]
|
||||||
parseResultToCsv :: (Foldable t, Functor t) => t (t B.ByteString) -> [CsvRecord]
|
parseResultToCsv = toListList . unpackFields
|
||||||
parseResultToCsv = toListList . unpackFields
|
where
|
||||||
where
|
|
||||||
toListList = toList . fmap toList
|
toListList = toList . fmap toList
|
||||||
unpackFields = (fmap . fmap) T.decodeUtf8
|
unpackFields = (fmap . fmap) T.decodeUtf8
|
||||||
|
|
||||||
printCSV :: [CsvRecord] -> TL.Text
|
-- | Scan for csv records where a conditional `skip` or `end` rule applies,
|
||||||
printCSV = TB.toLazyText . unlinesB . map printRecord
|
-- and apply that rule, removing one or more following records.
|
||||||
where printRecord = foldMap TB.fromText . intersperse "," . map printField
|
applyConditionalSkips :: CsvRules -> [CsvRecord] -> [CsvRecord]
|
||||||
printField = wrap "\"" "\"" . T.replace "\"" "\"\""
|
applyConditionalSkips _ [] = []
|
||||||
|
applyConditionalSkips rules (r:rest) =
|
||||||
-- | Do some cleanup and validation on the parsed CSV records.
|
case skipnum r of
|
||||||
-- Cleanups: filter out empty ([""]) records
|
Nothing -> r : applyConditionalSkips rules rest
|
||||||
--
|
Just cnt -> applyConditionalSkips rules $ drop (cnt-1) rest
|
||||||
-- *
|
|
||||||
--
|
|
||||||
-- Return the cleaned up and validated CSV data (can be empty), or an error.
|
|
||||||
validateCsv :: CsvRules -> [CsvRecord] -> Either String [CsvRecord]
|
|
||||||
validateCsv rules = validate . applyConditionalSkips . filternulls
|
|
||||||
where
|
where
|
||||||
filternulls = filter (/=[""])
|
skipnum r1 =
|
||||||
applyConditionalSkips [] = []
|
case (getEffectiveAssignment rules r1 "end", getEffectiveAssignment rules r1 "skip") of
|
||||||
applyConditionalSkips (r:rest) =
|
(Nothing, Nothing) -> Nothing
|
||||||
case skipnum r of
|
(Just _, _) -> Just maxBound
|
||||||
Nothing -> r:(applyConditionalSkips rest)
|
(Nothing, Just "") -> Just 1
|
||||||
Just cnt -> applyConditionalSkips (drop (cnt-1) rest)
|
(Nothing, Just x) -> Just (read $ T.unpack x)
|
||||||
where
|
|
||||||
skipnum r1 =
|
-- | Do some validation on the parsed CSV records:
|
||||||
case (getEffectiveAssignment rules r1 "end", getEffectiveAssignment rules r1 "skip") of
|
-- check that they all have at least two fields.
|
||||||
(Nothing, Nothing) -> Nothing
|
validateCsv :: [CsvRecord] -> Either String [CsvRecord]
|
||||||
(Just _, _) -> Just maxBound
|
validateCsv [] = Right []
|
||||||
(Nothing, Just "") -> Just 1
|
validateCsv rs@(_first:_) =
|
||||||
(Nothing, Just x) -> Just (read $ T.unpack x)
|
case lessthan2 of
|
||||||
validate [] = Right []
|
Just r -> Left $ printf "CSV record %s has less than two fields" (show r)
|
||||||
validate rs@(_first:_) = case lessthan2 of
|
Nothing -> Right rs
|
||||||
Just r -> Left $ printf "CSV record %s has less than two fields" (show r)
|
where
|
||||||
Nothing -> Right rs
|
lessthan2 = headMay $ filter ((<2).length) rs
|
||||||
where
|
|
||||||
lessthan2 = headMay $ filter ((<2).length) rs
|
|
||||||
|
|
||||||
-- -- | The highest (0-based) field index referenced in the field
|
-- -- | The highest (0-based) field index referenced in the field
|
||||||
-- -- definitions, or -1 if no fields are defined.
|
-- -- definitions, or -1 if no fields are defined.
|
||||||
|
|||||||
@ -2964,13 +2964,12 @@ skip N
|
|||||||
```
|
```
|
||||||
The word `skip` followed by a number (or no number, meaning 1)
|
The word `skip` followed by a number (or no number, meaning 1)
|
||||||
tells hledger to ignore this many non-empty lines at the start of the input data.
|
tells hledger to ignore this many non-empty lines at the start of the input data.
|
||||||
(Empty/blank lines are skipped automatically, so you don't need to count those.)
|
|
||||||
You'll need this whenever your CSV data contains header lines.
|
You'll need this whenever your CSV data contains header lines.
|
||||||
Header lines skipped in this way are ignored, and not parsed as CSV.
|
Note, empty and blank lines are skipped automatically, so you don't need to count those.
|
||||||
|
|
||||||
`skip` can also be used inside [if blocks](#if-block) (described below),
|
`skip` has a second meaning: it can be used inside [if blocks](#if-block) (described below),
|
||||||
to skip individual data records.
|
to skip one or more records whenever the condition is true.
|
||||||
Note records skipped in this way are still required to be [valid CSV](#valid-csv), even though otherwise ignored.
|
Records skipped in this way are ignored, except they are still required to be [valid CSV](#valid-csv).
|
||||||
|
|
||||||
## `date-format`
|
## `date-format`
|
||||||
|
|
||||||
|
|||||||
@ -1059,7 +1059,28 @@ fields date, b, c
|
|||||||
$ ./csvtest.sh
|
$ ./csvtest.sh
|
||||||
>=
|
>=
|
||||||
|
|
||||||
# 54. Some validation is done on account name assignments; trying to
|
# 54. Empty (zero length) or blank (containing only spaces, tabs, etc.) lines
|
||||||
|
# are skipped automatically, including inner ones; skip's argument
|
||||||
|
# counts only the non-empty/blank lines.
|
||||||
|
<
|
||||||
|
|
||||||
|
|
||||||
|
title
|
||||||
|
|
||||||
|
|
||||||
|
date, amount
|
||||||
|
2023-01-01, 1
|
||||||
|
RULES
|
||||||
|
skip 2
|
||||||
|
fields date, amount
|
||||||
|
$ ./csvtest.sh
|
||||||
|
2023-01-01
|
||||||
|
expenses:unknown 1
|
||||||
|
income:unknown -1
|
||||||
|
|
||||||
|
>=
|
||||||
|
|
||||||
|
# 55. Some validation is done on account name assignments; trying to
|
||||||
# also set an amount there (with 2+ spaces) will be rejected. (#1978)
|
# also set an amount there (with 2+ spaces) will be rejected. (#1978)
|
||||||
<
|
<
|
||||||
2022-01-01,1
|
2022-01-01,1
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user