From a9b63bb69447422872c755b413bbac652671fdfe Mon Sep 17 00:00:00 2001
From: Simon Michael <simon@joyful.com>
Date: Tue, 27 Dec 2022 12:21:20 -1000
Subject: [PATCH] fix: csv: skip header lines before attempting to parse
 records (#1967)

---
 hledger-lib/Hledger/Read/CsvReader.hs | 21 +++++++++++----------
 hledger/test/csv.test                 | 23 +++++++++++++++++++++--
 2 files changed, 32 insertions(+), 12 deletions(-)

diff --git a/hledger-lib/Hledger/Read/CsvReader.hs b/hledger-lib/Hledger/Read/CsvReader.hs
index d147684bc..80b286863 100644
--- a/hledger-lib/Hledger/Read/CsvReader.hs
+++ b/hledger-lib/Hledger/Read/CsvReader.hs
@@ -699,12 +699,6 @@ readJournalFromCsv mrulesfile csvfile csvdata = do
     rules <- liftEither $ parseAndValidateCsvRules rulesfile rulestext
     dbg6IO "csv rules" rules
 
-    -- parse the skip directive's value, if any
-    skiplines <- case getDirective "skip" rules of
-                      Nothing -> return 0
-                      Just "" -> return 1
-                      Just s  -> maybe (throwError $ "could not parse skip value: " ++ show s) return . readMay $ T.unpack s
-
     mtzin <- case getDirective "timezone" rules of
               Nothing -> return Nothing
               Just s  ->
@@ -712,6 +706,13 @@ readJournalFromCsv mrulesfile csvfile csvdata = do
                 parseTimeM False defaultTimeLocale "%Z" $ T.unpack s
     tzout <- liftIO getCurrentTimeZone
 
+    -- skip header lines (raw text lines, dropped before CSV parsing), if there is a top-level skip directive
+    skiplines <- case getDirective "skip" rules of
+                      Nothing -> return 0
+                      Just "" -> return 1
+                      Just s  -> maybe (throwError $ "could not parse skip value: " ++ show s) return . readMay $ T.unpack s
+    let csvdata' = T.unlines $ drop skiplines $ T.lines csvdata
+
     -- parse csv
     let
       -- parsec seems to fail if you pass it "-" here TODO: try again with megaparsec
@@ -725,8 +726,8 @@ readJournalFromCsv mrulesfile csvfile csvdata = do
           where
             ext = map toLower $ drop 1 $ takeExtension csvfile
     dbg6IO "using separator" separator
-    csv <- dbg7 "parseCsv" <$> parseCsv separator parsecfilename csvdata
-    records <- liftEither $ dbg7 "validateCsv" <$> validateCsv rules skiplines csv
+    csv <- dbg7 "parseCsv" <$> parseCsv separator parsecfilename csvdata'
+    records <- liftEither $ dbg7 "validateCsv" <$> validateCsv rules csv
     dbg6IO "first 3 csv records" $ take 3 records
 
     -- identify header lines
@@ -818,8 +819,8 @@ printCSV = TB.toLazyText . unlinesB . map printRecord
           printField = wrap "\"" "\"" . T.replace "\"" "\"\""
 
 -- | Return the cleaned up and validated CSV data (can be empty), or an error.
-validateCsv :: CsvRules -> Int -> CSV -> Either String [CsvRecord]
-validateCsv rules numhdrlines = validate . applyConditionalSkips . drop numhdrlines . filternulls
+validateCsv :: CsvRules -> CSV -> Either String [CsvRecord]
+validateCsv rules = validate . applyConditionalSkips . filternulls
   where
     filternulls = filter (/=[""])
     skipnum r =
diff --git a/hledger/test/csv.test b/hledger/test/csv.test
index 0039e5e65..164920ab8 100644
--- a/hledger/test/csv.test
+++ b/hledger/test/csv.test
@@ -1033,8 +1033,7 @@ $  ./csvtest.sh
 <
 "2021-12-23","caffe_siciliaexpenses:cibo:dolce","-10.5"
 
-RULES file
-
+RULES
 account1 assets:bank:checking
 fields date, description, account2, amount
 
@@ -1042,6 +1041,26 @@ $  ./csvtest.sh
 >2 /transaction is unbalanced/
 >=1
 
+# 52. We can't parse double quotes inside an unquoted field, or other non-RFC4180 data. (#1966)
+<
+2022-01-01,B"B",C
+RULES
+fields date, b, c
+$  ./csvtest.sh
+>2 /unexpected '"'/
+>=1
+
+# 53. A top-level skip directive is able to skip lines which would fail to parse as CSV. (#1967)
+<
+2022-01-01,B"B",C
+RULES
+skip 1
+fields date, b, c
+$  ./csvtest.sh
+>=
+
+
+
 ## . 
 #<
 #$  ./csvtest.sh