lib: Hledger.Read.CsvReader cleanup

Exports added: CsvRecord, CsvValue, csvFileFor Exports removed: expandIncludes, parseAndValidateCsvRules, transactionFromCsvRecord
2020-02-27 23:28:33 -08:00 · 2020-02-27 23:28:33 -08:00 · 2c8a6e988f
commit 2c8a6e988f
parent 5365fde3bc
2 changed files with 354 additions and 343 deletions
--- a/hledger-lib/Hledger/Read/CsvReader.hs
+++ b/hledger-lib/Hledger/Read/CsvReader.hs
@ -1,3 +1,6 @@
+-- * -*- eval: (orgstruct-mode 1); orgstruct-heading-prefix-regexp:"-- "; -*-
+-- ** doc
+-- In Emacs, use TAB on lines beginning with "-- *" to collapse/expand sections.
 {-|

 A reader for CSV data, using an extra rules file to help interpret the data.
@ -7,36 +10,40 @@ A reader for CSV data, using an extra rules file to help interpret the data.
 -- Here's a command that will render them:
 -- stack haddock hledger-lib --fast --no-haddock-deps --haddock-arguments='--ignore-all-exports' --open

+-- ** language
 {-# LANGUAGE CPP #-}
 {-# LANGUAGE FlexibleContexts #-}
-{-# LANGUAGE ScopedTypeVariables #-}
-{-# LANGUAGE TypeFamilies #-}
-{-# LANGUAGE OverloadedStrings #-}
-{-# LANGUAGE ViewPatterns #-}
-{-# LANGUAGE TypeSynonymInstances #-}
 {-# LANGUAGE FlexibleInstances #-}
-{-# LANGUAGE PackageImports #-}
 {-# LANGUAGE MultiWayIf #-}
 {-# LANGUAGE NamedFieldPuns #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE PackageImports #-}
 {-# LANGUAGE RecordWildCards #-}
+{-# LANGUAGE ScopedTypeVariables #-}
+{-# LANGUAGE TypeFamilies #-}
+{-# LANGUAGE TypeSynonymInstances #-}
+{-# LANGUAGE ViewPatterns #-}

+-- ** doctest setup
+-- $setup
+-- >>> :set -XOverloadedStrings
+
+-- ** exports
 module Hledger.Read.CsvReader (
  -- * Reader
  reader,
  -- * Misc.
-  CsvRecord,
-  CSV, Record, Field,
-  -- rules,
+  CSV, CsvRecord, CsvValue,
+  csvFileFor,
  rulesFileFor,
  parseRulesFile,
-  parseAndValidateCsvRules,
-  expandIncludes,
-  transactionFromCsvRecord,
  printCSV,
  -- * Tests
  tests_CsvReader,
 )
 where
+
+-- ** imports
 import Prelude ()
 import "base-compat-batteries" Prelude.Compat hiding (fail)
 import qualified "base-compat-batteries" Control.Monad.Fail.Compat as Fail (fail)
@ -80,11 +87,13 @@ import Hledger.Data
 import Hledger.Utils
 import Hledger.Read.Common (Reader(..),InputOpts(..),amountp, statusp, genericSourcePos, finaliseJournal)

-type CSV = [Record]
+-- ** some types

-type Record = [Field]
+type CSV       = [CsvRecord]
+type CsvRecord = [CsvValue]
+type CsvValue  = String

-type Field = String
+-- ** reader

 reader :: Reader
 reader = Reader
@ -110,185 +119,23 @@ parse iopts f t = do
                -- better preemptively reverse them once more. XXX inefficient
                pj' = journalReverse pj

-- | Parse special separator names TAB and SPACE, or return the first
-- character. Return Nothing on empty string
-parseSeparator :: String -> Maybe Char
-parseSeparator = specials . map toLower
-  where specials "space" = Just ' '
-        specials "tab"   = Just '\t'
-        specials (x:_)   = Just x
-        specials []      = Nothing
+-- ** reading rules files
+-- *** rules utilities

-- | Read a Journal from the given CSV data (and filename, used for error
-- messages), or return an error. Proceed as follows:
--
-- 1. parse CSV conversion rules from the specified rules file, or from
--    the default rules file for the specified CSV file, if it exists,
--    or throw a parse error; if it doesn't exist, use built-in default rules
--
-- 2. parse the CSV data, or throw a parse error
--
-- 3. convert the CSV records to transactions using the rules
--
-- 4. if the rules file didn't exist, create it with the default rules and filename
--
-- 5. return the transactions as a Journal
-- 
-readJournalFromCsv :: Maybe FilePath -> FilePath -> Text -> IO (Either String Journal)
-readJournalFromCsv Nothing "-" _ = return $ Left "please use --rules-file when reading CSV from stdin"
-readJournalFromCsv mrulesfile csvfile csvdata =
- handle (\(e::IOException) -> return $ Left $ show e) $ do
+-- Not used by hledger; just for lib users, 
+-- | An pure-exception-throwing IO action that parses this file's content
+-- as CSV conversion rules, interpolating any included files first,
+-- and runs some extra validation checks.
+parseRulesFile :: FilePath -> ExceptT String IO CsvRules
+parseRulesFile f =
+  liftIO (readFilePortably f >>= expandIncludes (takeDirectory f))
+    >>= either throwError return . parseAndValidateCsvRules f

-  -- make and throw an IO exception.. which we catch and convert to an Either above ?
-  let throwerr = throw . userError
-
-  -- parse the csv rules
-  let rulesfile = fromMaybe (rulesFileFor csvfile) mrulesfile
-  rulesfileexists <- doesFileExist rulesfile
-  rulestext <-
-    if rulesfileexists
-    then do
-      dbg1IO "using conversion rules file" rulesfile
-      readFilePortably rulesfile >>= expandIncludes (takeDirectory rulesfile)
-    else
-      return $ defaultRulesText rulesfile
-  rules <- either throwerr return $ parseAndValidateCsvRules rulesfile rulestext
-  dbg2IO "rules" rules
-
-  -- parse the skip directive's value, if any
-  let skiplines = case getDirective "skip" rules of
-                    Nothing -> 0
-                    Just "" -> 1
-                    Just s  -> readDef (throwerr $ "could not parse skip value: " ++ show s) s
-
-  -- parse csv
-  -- parsec seems to fail if you pass it "-" here TODO: try again with megaparsec
-  let parsecfilename = if csvfile == "-" then "(stdin)" else csvfile
-  let separator = fromMaybe ',' (getDirective "separator" rules >>= parseSeparator)
-  dbg2IO "separator" separator
-  records <- (either throwerr id .
-              dbg2 "validateCsv" . validateCsv rules skiplines .
-              dbg2 "parseCsv")
-             `fmap` parseCsv separator parsecfilename csvdata
-  dbg1IO "first 3 csv records" $ take 3 records
-
-  -- identify header lines
-  -- let (headerlines, datalines) = identifyHeaderLines records
-  --     mfieldnames = lastMay headerlines
-
-  let
-    -- convert CSV records to transactions
-    txns = snd $ mapAccumL
-                   (\pos r ->
-                      let
-                        SourcePos name line col = pos
-                        line' = (mkPos . (+1) . unPos) line
-                        pos' = SourcePos name line' col
-                      in
-                        (pos, transactionFromCsvRecord pos' rules r)
-                   )
-                   (initialPos parsecfilename) records
-
-    -- Ensure transactions are ordered chronologically.
-    -- First, if the CSV records seem to be most-recent-first (because
-    -- there's an explicit "newest-first" directive, or there's more
-    -- than one date and the first date is more recent than the last):
-    -- reverse them to get same-date transactions ordered chronologically.
-    txns' =
-      (if newestfirst || mseemsnewestfirst == Just True then reverse else id) txns
-      where
-        newestfirst = dbg3 "newestfirst" $ isJust $ getDirective "newest-first" rules
-        mseemsnewestfirst = dbg3 "mseemsnewestfirst" $
-          case nub $ map tdate txns of
-            ds | length ds > 1 -> Just $ head ds > last ds
-            _                  -> Nothing
-    -- Second, sort by date.
-    txns'' = sortBy (comparing tdate) txns'
-
-  when (not rulesfileexists) $ do
-    dbg1IO "creating conversion rules file" rulesfile
-    writeFile rulesfile $ T.unpack rulestext
-
-  return $ Right nulljournal{jtxns=txns''}
-
-parseCsv :: Char -> FilePath -> Text -> IO (Either String CSV)
-parseCsv separator filePath csvdata =
-  case filePath of
-    "-" -> liftM (parseCassava separator "(stdin)") T.getContents
-    _   -> return $ parseCassava separator filePath csvdata
-
-parseCassava :: Char -> FilePath -> Text -> Either String CSV
-parseCassava separator path content =
-  either (Left . errorBundlePretty) (Right . parseResultToCsv) <$>
-  CassavaMP.decodeWith (decodeOptions separator) Cassava.NoHeader path $
-  BL.fromStrict $ T.encodeUtf8 content
-
-decodeOptions :: Char -> Cassava.DecodeOptions
-decodeOptions separator = Cassava.defaultDecodeOptions {
-                      Cassava.decDelimiter = fromIntegral (ord separator)
-                    }
-
-parseResultToCsv :: (Foldable t, Functor t) => t (t B.ByteString) -> CSV
-parseResultToCsv = toListList . unpackFields
-    where
-        toListList = toList . fmap toList
-        unpackFields  = (fmap . fmap) (T.unpack . T.decodeUtf8)
-
-printCSV :: CSV -> String
-printCSV records = unlined (printRecord `map` records)
-    where printRecord = concat . intersperse "," . map printField
-          printField f = "\"" ++ concatMap escape f ++ "\""
-          escape '"' = "\"\""
-          escape x = [x]
-          unlined = concat . intersperse "\n"
-
-- | Return the cleaned up and validated CSV data (can be empty), or an error.
-validateCsv :: CsvRules -> Int -> Either String CSV -> Either String [CsvRecord]
-validateCsv _ _           (Left err) = Left err
-validateCsv rules numhdrlines (Right rs) = validate $ applyConditionalSkips $ drop numhdrlines $ filternulls rs
-  where
-    filternulls = filter (/=[""])
-    skipCount r =
-      case (getEffectiveAssignment rules r "end", getEffectiveAssignment rules r "skip") of
-        (Nothing, Nothing) -> Nothing
-        (Just _, _) -> Just maxBound
-        (Nothing, Just "") -> Just 1
-        (Nothing, Just x) -> Just (read x)
-    applyConditionalSkips [] = []
-    applyConditionalSkips (r:rest) =
-      case skipCount r of
-        Nothing -> r:(applyConditionalSkips rest)
-        Just cnt -> applyConditionalSkips (drop (cnt-1) rest)
-    validate [] = Right []
-    validate rs@(_first:_)
-      | isJust lessthan2 = let r = fromJust lessthan2 in
-          Left $ printf "CSV record %s has less than two fields" (show r)
-      | otherwise        = Right rs
-      where
-        lessthan2 = headMay $ filter ((<2).length) rs
-
-- -- | The highest (0-based) field index referenced in the field
-- -- definitions, or -1 if no fields are defined.
-- maxFieldIndex :: CsvRules -> Int
-- maxFieldIndex r = maximumDef (-1) $ catMaybes [
--                    dateField r
--                   ,statusField r
--                   ,codeField r
--                   ,amountField r
--                   ,amountInField r
--                   ,amountOutField r
--                   ,currencyField r
--                   ,accountField r
--                   ,account2Field r
--                   ,date2Field r
--                   ]
-
-- rulesFileFor :: CliOpts -> FilePath -> FilePath
-- rulesFileFor CliOpts{rules_file_=Just f} _ = f
-- rulesFileFor CliOpts{rules_file_=Nothing} csvfile = replaceExtension csvfile ".rules"
+-- | Given a CSV file path, what would normally be the corresponding rules file ?
 rulesFileFor :: FilePath -> FilePath
 rulesFileFor = (++ ".rules")

+-- | Given a CSV rules file path, what would normally be the corresponding CSV file ?
 csvFileFor :: FilePath -> FilePath
 csvFileFor = reverse . drop 6 . reverse

@ -317,8 +164,143 @@ defaultRulesText csvfile = T.pack $ unlines
  ," account2 assets:bank:savings\n"
  ]

--------------------------------------------------------------------------------
-- Conversion rules parsing
+addDirective :: (DirectiveName, String) -> CsvRules -> CsvRules
+addDirective d r = r{rdirectives=d:rdirectives r}
+
+addAssignment :: (HledgerFieldName, FieldTemplate) -> CsvRules -> CsvRules
+addAssignment a r = r{rassignments=a:rassignments r}
+
+setIndexesAndAssignmentsFromList :: [CsvFieldName] -> CsvRules -> CsvRules
+setIndexesAndAssignmentsFromList fs r = addAssignmentsFromList fs . setCsvFieldIndexesFromList fs $ r
+
+setCsvFieldIndexesFromList :: [CsvFieldName] -> CsvRules -> CsvRules
+setCsvFieldIndexesFromList fs r = r{rcsvfieldindexes=zip fs [1..]}
+
+addAssignmentsFromList :: [CsvFieldName] -> CsvRules -> CsvRules
+addAssignmentsFromList fs r = foldl' maybeAddAssignment r journalfieldnames
+  where
+    maybeAddAssignment rules f = (maybe id addAssignmentFromIndex $ elemIndex f fs) rules
+      where
+        addAssignmentFromIndex i = addAssignment (f, "%"++show (i+1))
+
+addConditionalBlock :: ConditionalBlock -> CsvRules -> CsvRules
+addConditionalBlock b r = r{rconditionalblocks=b:rconditionalblocks r}
+
+getDirective :: DirectiveName -> CsvRules -> Maybe FieldTemplate
+getDirective directivename = lookup directivename . rdirectives
+
+instance ShowErrorComponent String where
+  showErrorComponent = id
+
+-- | Inline all files referenced by include directives in this hledger CSV rules text, recursively.
+-- Included file paths may be relative to the directory of the provided file path.
+-- This is done as a pre-parse step to simplify the CSV rules parser.
+expandIncludes :: FilePath -> Text -> IO Text
+expandIncludes dir content = mapM (expandLine dir) (T.lines content) >>= return . T.unlines
+  where
+    expandLine dir line =
+      case line of
+        (T.stripPrefix "include " -> Just f) -> expandIncludes dir' =<< T.readFile f'
+          where
+            f' = dir </> dropWhile isSpace (T.unpack f)
+            dir' = takeDirectory f'
+        _ -> return line
+
+-- | An error-throwing IO action that parses this text as CSV conversion rules
+-- and runs some extra validation checks. The file path is used in error messages.
+parseAndValidateCsvRules :: FilePath -> T.Text -> Either String CsvRules
+parseAndValidateCsvRules rulesfile s =
+  case parseCsvRules rulesfile s of
+    Left err    -> Left $ customErrorBundlePretty err
+    Right rules -> first makeFancyParseError $ validateRules rules
+  where
+    makeFancyParseError :: String -> String
+    makeFancyParseError errorString =
+      parseErrorPretty (FancyError 0 (S.singleton $ ErrorFail errorString) :: ParseError Text String)
+
+-- | Parse this text as CSV conversion rules. The file path is for error messages.
+parseCsvRules :: FilePath -> T.Text -> Either (ParseErrorBundle T.Text CustomErr) CsvRules
+-- parseCsvRules rulesfile s = runParser csvrulesfile nullrules{baseAccount=takeBaseName rulesfile} rulesfile s
+parseCsvRules rulesfile s =
+  runParser (evalStateT rulesp defrules) rulesfile s
+
+-- | Return the validated rules, or an error.
+validateRules :: CsvRules -> Either String CsvRules
+validateRules rules = do
+  unless (isAssigned "date")   $ Left "Please specify (at top level) the date field. Eg: date %1\n"
+  Right rules
+  where
+    isAssigned f = isJust $ getEffectiveAssignment rules [] f
+
+-- *** rules types
+
+-- | A set of data definitions and account-matching patterns sufficient to
+-- convert a particular CSV data file into meaningful journal transactions.
+data CsvRules = CsvRules {
+  rdirectives        :: [(DirectiveName,String)],
+    -- ^ top-level rules, as (keyword, value) pairs
+  rcsvfieldindexes   :: [(CsvFieldName, CsvFieldIndex)],
+    -- ^ csv field names and their column number, if declared by a fields list
+  rassignments       :: [(HledgerFieldName, FieldTemplate)],
+    -- ^ top-level assignments to hledger fields, as (field name, value template) pairs
+  rconditionalblocks :: [ConditionalBlock]
+    -- ^ conditional blocks, which containing additional assignments/rules to apply to matched csv records
+} deriving (Show, Eq)
+
+type CsvRulesParser a = StateT CsvRules SimpleTextParser a
+
+-- | The keyword of a CSV rule - "fields", "skip", "if", etc.
+type DirectiveName    = String
+
+-- | CSV field name.
+type CsvFieldName     = String
+
+-- | 1-based CSV column number.
+type CsvFieldIndex    = Int
+
+-- | Percent symbol followed by a CSV field name or column number. Eg: %date, %1.
+type CsvFieldReference = String
+
+-- | One of the standard hledger fields or pseudo-fields that can be assigned to.
+-- Eg date, account1, amount, amount1-in, date-format.
+type HledgerFieldName = String
+
+-- | A text value to be assigned to a hledger field, possibly
+-- containing csv field references to be interpolated.
+type FieldTemplate    = String
+
+-- | A strptime date parsing pattern, as supported by Data.Time.Format.
+type DateFormat       = String
+
+-- | A regular expression.
+type RegexpPattern    = String
+
+-- | A single test for matching a CSV record, in one way or another.
+data Matcher =
+    RecordMatcher RegexpPattern                   -- ^ match if this regexp matches the overall CSV record
+  | FieldMatcher CsvFieldReference RegexpPattern  -- ^ match if this regexp matches the referenced CSV field's value
+  deriving (Show, Eq)
+
+-- | A conditional block: a set of CSV record matchers, and a sequence
+-- of rules which will be enabled only if one or more of the matchers
+-- succeeds.
+--
+-- Three types of rule are allowed inside conditional blocks: field
+-- assignments, skip, end. (A skip or end rule is stored as if it was
+-- a field assignment, and executed in validateCsv. XXX)
+data ConditionalBlock = CB {
+   cbMatchers    :: [Matcher]
+  ,cbAssignments :: [(HledgerFieldName, FieldTemplate)]
+  } deriving (Show, Eq)
+
+defrules = CsvRules {
+  rdirectives=[],
+  rcsvfieldindexes=[],
+  rassignments=[],
+  rconditionalblocks=[]
+}
+
+-- *** rules parsers

 {-
 Grammar for the CSV conversion rules, more or less:
@ -383,146 +365,6 @@ DIGIT: 0-9

 -}

-{- |
-A set of data definitions and account-matching patterns sufficient to
-convert a particular CSV data file into meaningful journal transactions.
-}
-data CsvRules = CsvRules {
-  rdirectives        :: [(DirectiveName,String)],
-    -- ^ top-level rules, as (keyword, value) pairs
-  rcsvfieldindexes   :: [(CsvFieldName, CsvFieldIndex)],
-    -- ^ csv field names and their column number, if declared by a fields list
-  rassignments       :: [(HledgerFieldName, FieldTemplate)],
-    -- ^ top-level assignments to hledger fields, as (field name, value template) pairs
-  rconditionalblocks :: [ConditionalBlock]
-    -- ^ conditional blocks, which containing additional assignments/rules to apply to matched csv records
-} deriving (Show, Eq)
-
-type CsvRulesParser a = StateT CsvRules SimpleTextParser a
-
-- | The keyword of a CSV rule - "fields", "skip", "if", etc.
-type DirectiveName    = String
-- | CSV field name.
-type CsvFieldName     = String
-- | 1-based CSV column number.
-type CsvFieldIndex    = Int
-- | Percent symbol followed by a CSV field name or column number. Eg: %date, %1.
-type CsvFieldReference = String
-- | One of the standard hledger fields or pseudo-fields that can be assigned to.
-- Eg date, account1, amount, amount1-in, date-format.
-type HledgerFieldName = String
-- | A text value to be assigned to a hledger field, possibly
-- containing csv field references to be interpolated.
-type FieldTemplate    = String
-- | A strptime date parsing pattern, as supported by Data.Time.Format.
-type DateFormat       = String
-- | A regular expression.
-type RegexpPattern    = String
-
-- | A single test for matching a CSV record, in one way or another.
-data Matcher =
-    RecordMatcher RegexpPattern                   -- ^ match if this regexp matches the overall CSV record
-  | FieldMatcher CsvFieldReference RegexpPattern  -- ^ match if this regexp matches the referenced CSV field's value
-  deriving (Show, Eq)
-
-- | A conditional block: a set of CSV record matchers, and a sequence
-- of rules which will be enabled only if one or more of the matchers
-- succeeds.
--
-- Three types of rule are allowed inside conditional blocks: field
-- assignments, skip, end. (A skip or end rule is stored as if it was
-- a field assignment, and executed in validateCsv. XXX)
-data ConditionalBlock = CB {
-   cbMatchers    :: [Matcher]
-  ,cbAssignments :: [(HledgerFieldName, FieldTemplate)]
-  } deriving (Show, Eq)
-
-defrules = CsvRules {
-  rdirectives=[],
-  rcsvfieldindexes=[],
-  rassignments=[],
-  rconditionalblocks=[]
-}
-
-addDirective :: (DirectiveName, String) -> CsvRules -> CsvRules
-addDirective d r = r{rdirectives=d:rdirectives r}
-
-addAssignment :: (HledgerFieldName, FieldTemplate) -> CsvRules -> CsvRules
-addAssignment a r = r{rassignments=a:rassignments r}
-
-setIndexesAndAssignmentsFromList :: [CsvFieldName] -> CsvRules -> CsvRules
-setIndexesAndAssignmentsFromList fs r = addAssignmentsFromList fs . setCsvFieldIndexesFromList fs $ r
-
-setCsvFieldIndexesFromList :: [CsvFieldName] -> CsvRules -> CsvRules
-setCsvFieldIndexesFromList fs r = r{rcsvfieldindexes=zip fs [1..]}
-
-addAssignmentsFromList :: [CsvFieldName] -> CsvRules -> CsvRules
-addAssignmentsFromList fs r = foldl' maybeAddAssignment r journalfieldnames
-  where
-    maybeAddAssignment rules f = (maybe id addAssignmentFromIndex $ elemIndex f fs) rules
-      where
-        addAssignmentFromIndex i = addAssignment (f, "%"++show (i+1))
-
-addConditionalBlock :: ConditionalBlock -> CsvRules -> CsvRules
-addConditionalBlock b r = r{rconditionalblocks=b:rconditionalblocks r}
-
-getDirective :: DirectiveName -> CsvRules -> Maybe FieldTemplate
-getDirective directivename = lookup directivename . rdirectives
-
-instance ShowErrorComponent String where
-  showErrorComponent = id
-
-- Not used by hledger; just for lib users, 
-- | An pure-exception-throwing IO action that parses this file's content
-- as CSV conversion rules, interpolating any included files first,
-- and runs some extra validation checks.
-parseRulesFile :: FilePath -> ExceptT String IO CsvRules
-parseRulesFile f =
-  liftIO (readFilePortably f >>= expandIncludes (takeDirectory f))
-    >>= either throwError return . parseAndValidateCsvRules f
-
-- | Inline all files referenced by include directives in this hledger CSV rules text, recursively.
-- Included file paths may be relative to the directory of the provided file path.
-- This is done as a pre-parse step to simplify the CSV rules parser.
-expandIncludes :: FilePath -> Text -> IO Text
-expandIncludes dir content = mapM (expandLine dir) (T.lines content) >>= return . T.unlines
-  where
-    expandLine dir line =
-      case line of
-        (T.stripPrefix "include " -> Just f) -> expandIncludes dir' =<< T.readFile f'
-          where
-            f' = dir </> dropWhile isSpace (T.unpack f)
-            dir' = takeDirectory f'
-        _ -> return line
-
-- | An error-throwing IO action that parses this text as CSV conversion rules
-- and runs some extra validation checks. The file path is used in error messages.
-parseAndValidateCsvRules :: FilePath -> T.Text -> Either String CsvRules
-parseAndValidateCsvRules rulesfile s =
-  case parseCsvRules rulesfile s of
-    Left err    -> Left $ customErrorBundlePretty err
-    Right rules -> first makeFancyParseError $ validateRules rules
-  where
-    makeFancyParseError :: String -> String
-    makeFancyParseError errorString =
-      parseErrorPretty (FancyError 0 (S.singleton $ ErrorFail errorString) :: ParseError Text String)
-
-- | Parse this text as CSV conversion rules. The file path is for error messages.
-parseCsvRules :: FilePath -> T.Text -> Either (ParseErrorBundle T.Text CustomErr) CsvRules
-- parseCsvRules rulesfile s = runParser csvrulesfile nullrules{baseAccount=takeBaseName rulesfile} rulesfile s
-parseCsvRules rulesfile s =
-  runParser (evalStateT rulesp defrules) rulesfile s
-
-- | Return the validated rules, or an error.
-validateRules :: CsvRules -> Either String CsvRules
-validateRules rules = do
-  unless (isAssigned "date")   $ Left "Please specify (at top level) the date field. Eg: date %1\n"
-  Right rules
-  where
-    isAssigned f = isJust $ getEffectiveAssignment rules [] f
-
-- parsers
-
 rulesp :: CsvRulesParser CsvRules
 rulesp = do
  _ <- many $ choiceInState
@ -564,13 +406,10 @@ directives :: [String]
 directives =
  ["date-format"
  ,"separator"
-  -- ,"default-account1"
+  -- ,"default-account"
  -- ,"default-currency"
-  -- ,"skip-lines" -- old
  ,"skip"
  ,"newest-first"
-   -- ,"base-account"
-   -- ,"base-currency"
  , "balance-type"
  ]

@ -734,10 +573,182 @@ regexp = do
 --   -- ,"!="
 --   ]

--------------------------------------------------------------------------------
-- Converting CSV records to journal transactions
+-- ** reading csv files

-type CsvRecord = [String]
+-- | Read a Journal from the given CSV data (and filename, used for error
+-- messages), or return an error. Proceed as follows:
+--
+-- 1. parse CSV conversion rules from the specified rules file, or from
+--    the default rules file for the specified CSV file, if it exists,
+--    or throw a parse error; if it doesn't exist, use built-in default rules
+--
+-- 2. parse the CSV data, or throw a parse error
+--
+-- 3. convert the CSV records to transactions using the rules
+--
+-- 4. if the rules file didn't exist, create it with the default rules and filename
+--
+-- 5. return the transactions as a Journal
+-- 
+readJournalFromCsv :: Maybe FilePath -> FilePath -> Text -> IO (Either String Journal)
+readJournalFromCsv Nothing "-" _ = return $ Left "please use --rules-file when reading CSV from stdin"
+readJournalFromCsv mrulesfile csvfile csvdata =
+ handle (\(e::IOException) -> return $ Left $ show e) $ do
+
+  -- make and throw an IO exception.. which we catch and convert to an Either above ?
+  let throwerr = throw . userError
+
+  -- parse the csv rules
+  let rulesfile = fromMaybe (rulesFileFor csvfile) mrulesfile
+  rulesfileexists <- doesFileExist rulesfile
+  rulestext <-
+    if rulesfileexists
+    then do
+      dbg1IO "using conversion rules file" rulesfile
+      readFilePortably rulesfile >>= expandIncludes (takeDirectory rulesfile)
+    else
+      return $ defaultRulesText rulesfile
+  rules <- either throwerr return $ parseAndValidateCsvRules rulesfile rulestext
+  dbg2IO "rules" rules
+
+  -- parse the skip directive's value, if any
+  let skiplines = case getDirective "skip" rules of
+                    Nothing -> 0
+                    Just "" -> 1
+                    Just s  -> readDef (throwerr $ "could not parse skip value: " ++ show s) s
+
+  -- parse csv
+  -- parsec seems to fail if you pass it "-" here TODO: try again with megaparsec
+  let parsecfilename = if csvfile == "-" then "(stdin)" else csvfile
+  let separator = fromMaybe ',' (getDirective "separator" rules >>= parseSeparator)
+  dbg2IO "separator" separator
+  records <- (either throwerr id .
+              dbg2 "validateCsv" . validateCsv rules skiplines .
+              dbg2 "parseCsv")
+             `fmap` parseCsv separator parsecfilename csvdata
+  dbg1IO "first 3 csv records" $ take 3 records
+
+  -- identify header lines
+  -- let (headerlines, datalines) = identifyHeaderLines records
+  --     mfieldnames = lastMay headerlines
+
+  let
+    -- convert CSV records to transactions
+    txns = snd $ mapAccumL
+                   (\pos r ->
+                      let
+                        SourcePos name line col = pos
+                        line' = (mkPos . (+1) . unPos) line
+                        pos' = SourcePos name line' col
+                      in
+                        (pos, transactionFromCsvRecord pos' rules r)
+                   )
+                   (initialPos parsecfilename) records
+
+    -- Ensure transactions are ordered chronologically.
+    -- First, if the CSV records seem to be most-recent-first (because
+    -- there's an explicit "newest-first" directive, or there's more
+    -- than one date and the first date is more recent than the last):
+    -- reverse them to get same-date transactions ordered chronologically.
+    txns' =
+      (if newestfirst || mseemsnewestfirst == Just True then reverse else id) txns
+      where
+        newestfirst = dbg3 "newestfirst" $ isJust $ getDirective "newest-first" rules
+        mseemsnewestfirst = dbg3 "mseemsnewestfirst" $
+          case nub $ map tdate txns of
+            ds | length ds > 1 -> Just $ head ds > last ds
+            _                  -> Nothing
+    -- Second, sort by date.
+    txns'' = sortBy (comparing tdate) txns'
+
+  when (not rulesfileexists) $ do
+    dbg1IO "creating conversion rules file" rulesfile
+    writeFile rulesfile $ T.unpack rulestext
+
+  return $ Right nulljournal{jtxns=txns''}
+
+-- | Parse special separator names TAB and SPACE, or return the first
+-- character. Return Nothing on empty string
+parseSeparator :: String -> Maybe Char
+parseSeparator = specials . map toLower
+  where specials "space" = Just ' '
+        specials "tab"   = Just '\t'
+        specials (x:_)   = Just x
+        specials []      = Nothing
+
+parseCsv :: Char -> FilePath -> Text -> IO (Either String CSV)
+parseCsv separator filePath csvdata =
+  case filePath of
+    "-" -> liftM (parseCassava separator "(stdin)") T.getContents
+    _   -> return $ parseCassava separator filePath csvdata
+
+parseCassava :: Char -> FilePath -> Text -> Either String CSV
+parseCassava separator path content =
+  either (Left . errorBundlePretty) (Right . parseResultToCsv) <$>
+  CassavaMP.decodeWith (decodeOptions separator) Cassava.NoHeader path $
+  BL.fromStrict $ T.encodeUtf8 content
+
+decodeOptions :: Char -> Cassava.DecodeOptions
+decodeOptions separator = Cassava.defaultDecodeOptions {
+                      Cassava.decDelimiter = fromIntegral (ord separator)
+                    }
+
+parseResultToCsv :: (Foldable t, Functor t) => t (t B.ByteString) -> CSV
+parseResultToCsv = toListList . unpackFields
+    where
+        toListList = toList . fmap toList
+        unpackFields  = (fmap . fmap) (T.unpack . T.decodeUtf8)
+
+printCSV :: CSV -> String
+printCSV records = unlined (printRecord `map` records)
+    where printRecord = concat . intersperse "," . map printField
+          printField f = "\"" ++ concatMap escape f ++ "\""
+          escape '"' = "\"\""
+          escape x = [x]
+          unlined = concat . intersperse "\n"
+
+-- | Return the cleaned up and validated CSV data (can be empty), or an error.
+validateCsv :: CsvRules -> Int -> Either String CSV -> Either String [CsvRecord]
+validateCsv _ _           (Left err) = Left err
+validateCsv rules numhdrlines (Right rs) = validate $ applyConditionalSkips $ drop numhdrlines $ filternulls rs
+  where
+    filternulls = filter (/=[""])
+    skipCount r =
+      case (getEffectiveAssignment rules r "end", getEffectiveAssignment rules r "skip") of
+        (Nothing, Nothing) -> Nothing
+        (Just _, _) -> Just maxBound
+        (Nothing, Just "") -> Just 1
+        (Nothing, Just x) -> Just (read x)
+    applyConditionalSkips [] = []
+    applyConditionalSkips (r:rest) =
+      case skipCount r of
+        Nothing -> r:(applyConditionalSkips rest)
+        Just cnt -> applyConditionalSkips (drop (cnt-1) rest)
+    validate [] = Right []
+    validate rs@(_first:_)
+      | isJust lessthan2 = let r = fromJust lessthan2 in
+          Left $ printf "CSV record %s has less than two fields" (show r)
+      | otherwise        = Right rs
+      where
+        lessthan2 = headMay $ filter ((<2).length) rs
+
+-- -- | The highest (0-based) field index referenced in the field
+-- -- definitions, or -1 if no fields are defined.
+-- maxFieldIndex :: CsvRules -> Int
+-- maxFieldIndex r = maximumDef (-1) $ catMaybes [
+--                    dateField r
+--                   ,statusField r
+--                   ,codeField r
+--                   ,amountField r
+--                   ,amountInField r
+--                   ,amountOutField r
+--                   ,currencyField r
+--                   ,accountField r
+--                   ,account2Field r
+--                   ,date2Field r
+--                   ]
+
+-- ** converting csv records to transactions

 showRules rules record =
  unlines $ catMaybes [ (("the "++fld++" rule is: ")++) <$> getEffectiveAssignment rules record fld | fld <- journalfieldnames]
@ -940,6 +951,7 @@ mkPosting rules record number accountFld amountFld amountInFld amountOutFld bala
                              -- XXX what's this needed for ? Test & document, or drop.
                              -- Also, this the only place we interpolate in a keyword rule, I think.
                               `withDefault` ruleval ("default-account" ++ number))
+    -- XXX what's this needed for ? Test & document, or drop.
    mdefaultcurrency = rule "default-currency"
    currency = fromMaybe (fromMaybe "" mdefaultcurrency) $
               fieldval ("currency"++number) `withDefault` fieldval "currency"
@ -985,7 +997,7 @@ improveUnknownAccountName p@Posting{..}
 -- possibly set by a balance-type rule.
 -- The CSV rules and current record are also provided, to be shown in case
 -- balance-type's argument is bad (XXX refactor).
-mkBalanceAssertion :: CsvRules -> Record -> (Amount, GenericSourcePos) -> BalanceAssertion
+mkBalanceAssertion :: CsvRules -> CsvRecord -> (Amount, GenericSourcePos) -> BalanceAssertion
 mkBalanceAssertion rules record (amt, pos) = assrt{baamount=amt, baposition=pos}
  where
    assrt =
@ -1162,8 +1174,7 @@ parseDateWithCustomOrDefaultFormats mformat s = firstJust $ map parsewith format
               (:[])
                mformat

--------------------------------------------------------------------------------
-- tests
+-- ** tests

 tests_CsvReader = tests "CsvReader" [
   tests "parseCsvRules" [
--- a/hledger/Hledger/Cli/Commands/Register.hs
+++ b/hledger/Hledger/Cli/Commands/Register.hs
@ -24,7 +24,7 @@ import Data.Maybe
 -- import Data.Text (Text)
 import qualified Data.Text as T
 import System.Console.CmdArgs.Explicit
-import Hledger.Read.CsvReader (CSV, Record, printCSV)
+import Hledger.Read.CsvReader (CSV, CsvRecord, printCSV)

 import Hledger
 import Hledger.Cli.CliOptions
@ -71,7 +71,7 @@ postingsReportAsCsv (_,is) =
  :
  map postingsReportItemAsCsvRecord is

-postingsReportItemAsCsvRecord :: PostingsReportItem -> Record
+postingsReportItemAsCsvRecord :: PostingsReportItem -> CsvRecord
 postingsReportItemAsCsvRecord (_, _, _, p, b) = [idx,date,code,desc,acct,amt,bal]
  where
    idx  = show $ maybe 0 tindex $ ptransaction p