From 577e4b6347a0930e523347ad760e2dbfa7835546 Mon Sep 17 00:00:00 2001 From: Simon Michael Date: Thu, 11 May 2023 16:37:37 -1000 Subject: [PATCH] fix!: csv: skip now counts non-blank lines more robustly (fix #2024) Inner empty lines were not being skipped automatically, contrary to docs. Now all empty lines are skipped automatically, and the `skip` rule is needed only for non-empty lines, as intended. This may be a breaking change: it's possible that the `skip` count might need to be adjusted in some CSV rules files. --- hledger-lib/Hledger/Read/CsvReader.hs | 133 ++++++++++++++------------ hledger/hledger.m4.md | 9 +- hledger/test/csv.test | 23 ++++- 3 files changed, 98 insertions(+), 67 deletions(-) diff --git a/hledger-lib/Hledger/Read/CsvReader.hs b/hledger-lib/Hledger/Read/CsvReader.hs index 7f86c578d..bb5f113ca 100644 --- a/hledger-lib/Hledger/Read/CsvReader.hs +++ b/hledger-lib/Hledger/Read/CsvReader.hs @@ -86,6 +86,13 @@ type CSV = [CsvRecord] type CsvRecord = [CsvValue] type CsvValue = Text +-- ** utils + +printCSV :: [CsvRecord] -> TL.Text +printCSV = TB.toLazyText . unlinesB . map printRecord + where printRecord = foldMap TB.fromText . intersperse "," . map printField + printField = wrap "\"" "\"" . T.replace "\"" "\"\"" + --- ** reader reader :: MonadIO m => Reader m @@ -687,8 +694,10 @@ regexp end = do -- readJournalFromCsv :: Maybe FilePath -> FilePath -> Text -> ExceptT String IO Journal readJournalFromCsv Nothing "-" _ = throwError "please use --rules-file when reading CSV from stdin" -readJournalFromCsv mrulesfile csvfile csvdata = do - -- parse the csv rules +readJournalFromCsv mrulesfile csvfile csvtext = do + -- for now, correctness is the priority here, efficiency not so much + + -- get the csv rules as text let rulesfile = fromMaybe (rulesFileFor csvfile) mrulesfile rulesfileexists <- liftIO $ doesFileExist rulesfile rulestext <- liftIO $ if rulesfileexists @@ -697,27 +706,24 @@ readJournalFromCsv mrulesfile csvfile csvdata = do readFilePortably rulesfile >>= expandIncludes (takeDirectory rulesfile) else return $ defaultRulesText rulesfile + + -- parse the csv rules rules <- liftEither $ parseAndValidateCsvRules rulesfile rulestext dbg6IO "csv rules" rules - mtzin <- case getDirective "timezone" rules of - Nothing -> return Nothing - Just s -> - maybe (throwError $ "could not parse time zone: " ++ T.unpack s) (return.Just) $ - parseTimeM False defaultTimeLocale "%Z" $ T.unpack s - tzout <- liftIO getCurrentTimeZone + -- convert the csv data to lines and remove all empty/blank lines + let csvlines1 = dbg9 "csvlines1" $ filter (not . T.null . T.strip) $ dbg9 "csvlines0" $ T.lines csvtext - -- skip header lines, if there is a top-level skip rule + -- if there is a top-level skip rule, skip the specified number of non-empty lines skiplines <- case getDirective "skip" rules of Nothing -> return 0 Just "" -> return 1 Just s -> maybe (throwError $ "could not parse skip value: " ++ show s) return . readMay $ T.unpack s - let csvdata' = T.unlines $ drop skiplines $ T.lines csvdata + let csvlines2 = dbg9 "csvlines2" $ drop skiplines csvlines1 - -- parse csv + -- convert back to text and parse as csv records let - -- parsec seems to fail if you pass it "-" here TODO: try again with megaparsec - parsecfilename = if csvfile == "-" then "(stdin)" else csvfile + csvtext1 = T.unlines csvlines2 separator = case getDirective "separator" rules >>= parseSeparator of Just c -> c @@ -726,15 +732,27 @@ readJournalFromCsv mrulesfile csvfile csvdata = do _ -> ',' where ext = map toLower $ drop 1 $ takeExtension csvfile + -- parsec seemed to fail if you pass it "-" here -- TODO: try again with megaparsec + parsecfilename = if csvfile == "-" then "(stdin)" else csvfile dbg6IO "using separator" separator - csv <- dbg7 "parseCsv" <$> parseCsv separator parsecfilename csvdata' - records <- liftEither $ dbg7 "validateCsv" <$> validateCsv rules csv - dbg6IO "first 3 csv records" $ take 3 records + -- parse csv records + csvrecords0 <- dbg7 "parseCsv" <$> parseCsv separator parsecfilename csvtext1 + -- remove any records skipped by conditional skip or end rules + let csvrecords1 = applyConditionalSkips rules csvrecords0 + -- and check the remaining records for any obvious problems + csvrecords <- liftEither $ dbg7 "validateCsv" <$> validateCsv csvrecords1 + dbg6IO "first 3 csv records" $ take 3 csvrecords - -- identify header lines - -- let (headerlines, datalines) = identifyHeaderLines records + -- XXX identify header lines some day ? + -- let (headerlines, datalines) = identifyHeaderLines csvrecords' -- mfieldnames = lastMay headerlines + tzout <- liftIO getCurrentTimeZone + mtzin <- case getDirective "timezone" rules of + Nothing -> return Nothing + Just s -> + maybe (throwError $ "could not parse time zone: " ++ T.unpack s) (return.Just) $ + parseTimeM False defaultTimeLocale "%Z" $ T.unpack s let -- convert CSV records to transactions, saving the CSV line numbers for error positions txns = dbg7 "csv txns" $ snd $ mapAccumL @@ -746,7 +764,7 @@ readJournalFromCsv mrulesfile csvfile csvdata = do in (pos', transactionFromCsvRecord timesarezoned mtzin tzout pos rules r) ) - (initialPos parsecfilename) records + (initialPos parsecfilename) csvrecords where timesarezoned = case csvRule rules "date-format" of @@ -793,62 +811,55 @@ parseSeparator = specials . T.toLower -- Call parseCassava on a file or stdin, converting the result to ExceptT. parseCsv :: Char -> FilePath -> Text -> ExceptT String IO [CsvRecord] -parseCsv separator filePath csvdata = ExceptT $ +parseCsv separator filePath csvtext = ExceptT $ case filePath of "-" -> parseCassava separator "(stdin)" <$> T.getContents - _ -> return $ if T.null csvdata then Right mempty else parseCassava separator filePath csvdata + _ -> return $ if T.null csvtext then Right mempty else parseCassava separator filePath csvtext -- Parse text into CSV records, using Cassava and the given field separator. parseCassava :: Char -> FilePath -> Text -> Either String [CsvRecord] parseCassava separator path content = + -- XXX we now remove all blank lines before parsing; will Cassava will still produce [""] records ? + -- filter (/=[""]) either (Left . errorBundlePretty) (Right . parseResultToCsv) <$> - CassavaMegaparsec.decodeWith (decodeOptions separator) Cassava.NoHeader path $ + CassavaMegaparsec.decodeWith decodeOptions Cassava.NoHeader path $ BL.fromStrict $ T.encodeUtf8 content - -decodeOptions :: Char -> Cassava.DecodeOptions -decodeOptions separator = Cassava.defaultDecodeOptions { + where + decodeOptions = Cassava.defaultDecodeOptions { Cassava.decDelimiter = fromIntegral (ord separator) } - -parseResultToCsv :: (Foldable t, Functor t) => t (t B.ByteString) -> [CsvRecord] -parseResultToCsv = toListList . unpackFields - where + parseResultToCsv :: (Foldable t, Functor t) => t (t B.ByteString) -> [CsvRecord] + parseResultToCsv = toListList . unpackFields + where toListList = toList . fmap toList unpackFields = (fmap . fmap) T.decodeUtf8 -printCSV :: [CsvRecord] -> TL.Text -printCSV = TB.toLazyText . unlinesB . map printRecord - where printRecord = foldMap TB.fromText . intersperse "," . map printField - printField = wrap "\"" "\"" . T.replace "\"" "\"\"" - --- | Do some cleanup and validation on the parsed CSV records. --- Cleanups: filter out empty ([""]) records --- --- * --- --- Return the cleaned up and validated CSV data (can be empty), or an error. -validateCsv :: CsvRules -> [CsvRecord] -> Either String [CsvRecord] -validateCsv rules = validate . applyConditionalSkips . filternulls +-- | Scan for csv records where a conditional `skip` or `end` rule applies, +-- and apply that rule, removing one or more following records. +applyConditionalSkips :: CsvRules -> [CsvRecord] -> [CsvRecord] +applyConditionalSkips _ [] = [] +applyConditionalSkips rules (r:rest) = + case skipnum r of + Nothing -> r : applyConditionalSkips rules rest + Just cnt -> applyConditionalSkips rules $ drop (cnt-1) rest where - filternulls = filter (/=[""]) - applyConditionalSkips [] = [] - applyConditionalSkips (r:rest) = - case skipnum r of - Nothing -> r:(applyConditionalSkips rest) - Just cnt -> applyConditionalSkips (drop (cnt-1) rest) - where - skipnum r1 = - case (getEffectiveAssignment rules r1 "end", getEffectiveAssignment rules r1 "skip") of - (Nothing, Nothing) -> Nothing - (Just _, _) -> Just maxBound - (Nothing, Just "") -> Just 1 - (Nothing, Just x) -> Just (read $ T.unpack x) - validate [] = Right [] - validate rs@(_first:_) = case lessthan2 of - Just r -> Left $ printf "CSV record %s has less than two fields" (show r) - Nothing -> Right rs - where - lessthan2 = headMay $ filter ((<2).length) rs + skipnum r1 = + case (getEffectiveAssignment rules r1 "end", getEffectiveAssignment rules r1 "skip") of + (Nothing, Nothing) -> Nothing + (Just _, _) -> Just maxBound + (Nothing, Just "") -> Just 1 + (Nothing, Just x) -> Just (read $ T.unpack x) + +-- | Do some validation on the parsed CSV records: +-- check that they all have at least two fields. +validateCsv :: [CsvRecord] -> Either String [CsvRecord] +validateCsv [] = Right [] +validateCsv rs@(_first:_) = + case lessthan2 of + Just r -> Left $ printf "CSV record %s has less than two fields" (show r) + Nothing -> Right rs + where + lessthan2 = headMay $ filter ((<2).length) rs -- -- | The highest (0-based) field index referenced in the field -- -- definitions, or -1 if no fields are defined. diff --git a/hledger/hledger.m4.md b/hledger/hledger.m4.md index 139dbe35d..6cdd23912 100644 --- a/hledger/hledger.m4.md +++ b/hledger/hledger.m4.md @@ -2964,13 +2964,12 @@ skip N ``` The word `skip` followed by a number (or no number, meaning 1) tells hledger to ignore this many non-empty lines at the start of the input data. -(Empty/blank lines are skipped automatically, so you don't need to count those.) You'll need this whenever your CSV data contains header lines. -Header lines skipped in this way are ignored, and not parsed as CSV. +Note, empty and blank lines are skipped automatically, so you don't need to count those. -`skip` can also be used inside [if blocks](#if-block) (described below), -to skip individual data records. -Note records skipped in this way are still required to be [valid CSV](#valid-csv), even though otherwise ignored. +`skip` has a second meaning: it can be used inside [if blocks](#if-block) (described below), +to skip one or more records whenever the condition is true. +Records skipped in this way are ignored, except they are still required to be [valid CSV](#valid-csv). ## `date-format` diff --git a/hledger/test/csv.test b/hledger/test/csv.test index 2c134bb5e..feb9fcf78 100644 --- a/hledger/test/csv.test +++ b/hledger/test/csv.test @@ -1059,7 +1059,28 @@ fields date, b, c $ ./csvtest.sh >= -# 54. Some validation is done on account name assignments; trying to +# 54. Empty (zero length) or blank (containing only spaces, tabs, etc.) lines +# are skipped automatically, including inner ones; skip's argument +# counts only the non-empty/blank lines. +< + + +title + + +date, amount +2023-01-01, 1 +RULES +skip 2 +fields date, amount +$ ./csvtest.sh +2023-01-01 + expenses:unknown 1 + income:unknown -1 + +>= + +# 55. Some validation is done on account name assignments; trying to # also set an amount there (with 2+ spaces) will be rejected. (#1978) < 2022-01-01,1