From a9b63bb69447422872c755b413bbac652671fdfe Mon Sep 17 00:00:00 2001 From: Simon Michael Date: Tue, 27 Dec 2022 12:21:20 -1000 Subject: [PATCH] fix: csv: skip header lines before attempting to parse records (#1967) --- hledger-lib/Hledger/Read/CsvReader.hs | 21 +++++++++++---------- hledger/test/csv.test | 23 +++++++++++++++++++++-- 2 files changed, 32 insertions(+), 12 deletions(-) diff --git a/hledger-lib/Hledger/Read/CsvReader.hs b/hledger-lib/Hledger/Read/CsvReader.hs index d147684bc..80b286863 100644 --- a/hledger-lib/Hledger/Read/CsvReader.hs +++ b/hledger-lib/Hledger/Read/CsvReader.hs @@ -699,12 +699,6 @@ readJournalFromCsv mrulesfile csvfile csvdata = do rules <- liftEither $ parseAndValidateCsvRules rulesfile rulestext dbg6IO "csv rules" rules - -- parse the skip directive's value, if any - skiplines <- case getDirective "skip" rules of - Nothing -> return 0 - Just "" -> return 1 - Just s -> maybe (throwError $ "could not parse skip value: " ++ show s) return . readMay $ T.unpack s - mtzin <- case getDirective "timezone" rules of Nothing -> return Nothing Just s -> @@ -712,6 +706,13 @@ readJournalFromCsv mrulesfile csvfile csvdata = do parseTimeM False defaultTimeLocale "%Z" $ T.unpack s tzout <- liftIO getCurrentTimeZone + -- skip header lines, if there is a top-level skip rule + skiplines <- case getDirective "skip" rules of + Nothing -> return 0 + Just "" -> return 1 + Just s -> maybe (throwError $ "could not parse skip value: " ++ show s) return . readMay $ T.unpack s + let csvdata' = T.unlines $ drop skiplines $ T.lines csvdata + -- parse csv let -- parsec seems to fail if you pass it "-" here TODO: try again with megaparsec @@ -725,8 +726,8 @@ readJournalFromCsv mrulesfile csvfile csvdata = do where ext = map toLower $ drop 1 $ takeExtension csvfile dbg6IO "using separator" separator - csv <- dbg7 "parseCsv" <$> parseCsv separator parsecfilename csvdata - records <- liftEither $ dbg7 "validateCsv" <$> validateCsv rules skiplines csv + csv <- dbg7 "parseCsv" <$> parseCsv separator parsecfilename csvdata' + records <- liftEither $ dbg7 "validateCsv" <$> validateCsv rules csv dbg6IO "first 3 csv records" $ take 3 records -- identify header lines @@ -818,8 +819,8 @@ printCSV = TB.toLazyText . unlinesB . map printRecord printField = wrap "\"" "\"" . T.replace "\"" "\"\"" -- | Return the cleaned up and validated CSV data (can be empty), or an error. -validateCsv :: CsvRules -> Int -> CSV -> Either String [CsvRecord] -validateCsv rules numhdrlines = validate . applyConditionalSkips . drop numhdrlines . filternulls +validateCsv :: CsvRules -> CSV -> Either String [CsvRecord] +validateCsv rules = validate . applyConditionalSkips . filternulls where filternulls = filter (/=[""]) skipnum r = diff --git a/hledger/test/csv.test b/hledger/test/csv.test index 0039e5e65..164920ab8 100644 --- a/hledger/test/csv.test +++ b/hledger/test/csv.test @@ -1033,8 +1033,7 @@ $ ./csvtest.sh < "2021-12-23","caffe_siciliaexpenses:cibo:dolce","-10.5" -RULES file - +RULES account1 assets:bank:checking fields date, description, account2, amount @@ -1042,6 +1041,26 @@ $ ./csvtest.sh >2 /transaction is unbalanced/ >=1 +# 52. We can't parse double quotes inside an unquoted field, or other non-RFC4180 data. (#1966) +< +2022-01-01,B"B",C +RULES +fields date, b, c +$ ./csvtest.sh +>2 /unexpected '"'/ +>=1 + +# 53. A top-level skip directive is able to skip lines which would fail to parse as CSV. (#1967) +< +2022-01-01,B"B",C +RULES +skip 1 +fields date, b, c +$ ./csvtest.sh +>= + + + ## . #< #$ ./csvtest.sh