From 4d7a809c4a3d702da650a042d2012564dbe18704 Mon Sep 17 00:00:00 2001 From: Simon Michael Date: Sat, 10 Mar 2012 21:55:48 +0000 Subject: [PATCH] cleanups and early code for csv reader based on convert --- hledger-lib/Hledger/Read.hs | 19 +- hledger-lib/Hledger/Read/CsvReader.hs | 584 ++++++++++++++++++++++++++ 2 files changed, 597 insertions(+), 6 deletions(-) create mode 100644 hledger-lib/Hledger/Read/CsvReader.hs diff --git a/hledger-lib/Hledger/Read.hs b/hledger-lib/Hledger/Read.hs index 0586b5ea0..d08498cd4 100644 --- a/hledger-lib/Hledger/Read.hs +++ b/hledger-lib/Hledger/Read.hs @@ -36,6 +36,7 @@ import Hledger.Data.Types (Journal(..), Reader(..)) import Hledger.Data.Journal (nullctx) import Hledger.Read.JournalReader as JournalReader import Hledger.Read.TimelogReader as TimelogReader +import Hledger.Read.CsvReader as CsvReader import Hledger.Utils import Prelude hiding (getContents, writeFile) import Hledger.Utils.UTF8 (getContents, hGetContents, writeFile) @@ -51,6 +52,7 @@ readers :: [Reader] readers = [ JournalReader.reader ,TimelogReader.reader + -- ,CsvReader.reader ] -- | All the data formats we can read. @@ -74,18 +76,22 @@ journalFromPathAndString format fp s = do let readerstotry = case format of Nothing -> readers Just f -> case readerForFormat f of Just r -> [r] Nothing -> [] - (errors, journals) <- partitionEithers `fmap` mapM tryReader readerstotry + (errors, journals) <- partitionEithers `fmap` mapM (tryReader fp s) readerstotry case journals of j:_ -> return $ Right j - _ -> return $ Left $ bestErrorMsg errors - where - tryReader r = (runErrorT . (rParser r) fp) s + _ -> return $ Left $ bestErrorMsg errors fp s + -- where + +tryReader :: FilePath -> String -> Reader -> IO (Either String Journal) +tryReader fp s r = do -- printf "trying to read %s format\n" (rFormat r) + (runErrorT . (rParser r) fp) s + -- unknown format - bestErrorMsg [] = printf "could not parse %sdata in %s" (fmt formats) fp +bestErrorMsg [] fp _ = printf "could not parse %sdata in %s" (fmt formats) fp where fmt [] = "" fmt [f] = f ++ " " fmt fs = intercalate ", " (init fs) ++ " or " ++ last fs ++ " " -- one or more errors - report (the most appropriate ?) one - bestErrorMsg es = printf "could not parse %s data in %s\n%s" (rFormat r) fp e +bestErrorMsg es fp s = printf "could not parse %s data in %s\n%s" (rFormat r) fp e where (r,e) = headDef (head readers, head es) $ filter detects $ zip readers es detects (r,_) = (rDetector r) fp s @@ -152,6 +158,7 @@ tests_Hledger_Read = TestList [ tests_Hledger_Read_JournalReader, tests_Hledger_Read_TimelogReader, + tests_Hledger_Read_CsvReader, "journalFile" ~: do assertBool "journalFile should parse an empty file" (isRight $ parseWithCtx nullctx JournalReader.journalFile "") diff --git a/hledger-lib/Hledger/Read/CsvReader.hs b/hledger-lib/Hledger/Read/CsvReader.hs new file mode 100644 index 000000000..b3172e22f --- /dev/null +++ b/hledger-lib/Hledger/Read/CsvReader.hs @@ -0,0 +1,584 @@ +{-| + +A reader for CSV files. Uses optional extra rules to help interpret the +data, like the convert command. + +-} + +module Hledger.Read.CsvReader ( + reader, + tests_Hledger_Read_CsvReader +) +where +import Control.Monad +import Control.Monad.Error +import Test.HUnit +-- import Text.ParserCombinators.Parsec hiding (parse) +import Data.List +import Data.Maybe +import Data.Ord +import Data.Time.Format (parseTime) +import Safe +import System.Directory (doesFileExist) +import System.Exit (exitFailure) +import System.FilePath (takeBaseName, replaceExtension) +import System.IO (stderr) +import System.Locale (defaultTimeLocale) +import Test.HUnit +import Text.CSV (parseCSV, parseCSVFromFile, CSV) +import Text.ParserCombinators.Parsec +import Text.Printf (hPrintf) + +import Hledger.Data +import Hledger.Read.Utils +import Prelude hiding (getContents) +import Hledger.Utils.UTF8 (getContents) +import Hledger.Utils +import Hledger.Data.FormatStrings as FormatStrings +import Hledger.Read.JournalReader (ledgeraccountname, someamount) +-- import Hledger.Read.JournalReader (ledgerDirective, ledgerHistoricalPrice, +-- ledgerDefaultYear, emptyLine, ledgerdatetime) + +reader :: Reader +reader = Reader format detect parse_ + +format :: String +format = "csv" + +-- | Does the given file path and data look like CSV ? +detect :: FilePath -> String -> Bool +detect f _ = fileSuffix f == format + +-- | Parse and post-process a "Journal" from CSV data, or give an error. +-- XXX currently ignores the string and reads from the file path +parse_ :: FilePath -> String -> ErrorT String IO Journal +parse_ f s = do + r <- liftIO $ journalFromCsv f s + case r of Left e -> throwError e + Right j -> return j + +-- csvFile :: GenParser Char JournalContext (JournalUpdate,JournalContext) +-- csvFile = do items <- many timelogItem +-- eof +-- ctx <- getState +-- return (liftM (foldr (.) id) $ sequence items, ctx) + + + +-- XXX copied from Convert.hs + +{- | +A set of data definitions and account-matching patterns sufficient to +convert a particular CSV data file into meaningful journal transactions. See above. +-} +data CsvRules = CsvRules { + dateField :: Maybe FieldPosition, + dateFormat :: Maybe String, + statusField :: Maybe FieldPosition, + codeField :: Maybe FieldPosition, + descriptionField :: [FormatString], + amountField :: Maybe FieldPosition, + amountInField :: Maybe FieldPosition, + amountOutField :: Maybe FieldPosition, + currencyField :: Maybe FieldPosition, + baseCurrency :: Maybe String, + accountField :: Maybe FieldPosition, + account2Field :: Maybe FieldPosition, + effectiveDateField :: Maybe FieldPosition, + baseAccount :: AccountName, + accountRules :: [AccountRule] +} deriving (Show, Eq) + +nullrules = CsvRules { + dateField=Nothing, + dateFormat=Nothing, + statusField=Nothing, + codeField=Nothing, + descriptionField=[], + amountField=Nothing, + amountInField=Nothing, + amountOutField=Nothing, + currencyField=Nothing, + baseCurrency=Nothing, + accountField=Nothing, + account2Field=Nothing, + effectiveDateField=Nothing, + baseAccount="unknown", + accountRules=[] +} + +type FieldPosition = Int + +type AccountRule = ( + [(String, Maybe String)] -- list of regex match patterns with optional replacements + ,AccountName -- account name to use for a transaction matching this rule + ) + +type CsvRecord = [String] + + +-- | Read the CSV file named as an argument and print equivalent journal transactions, +-- using/creating a .rules file. +journalFromCsv :: FilePath -> String -> IO (Either String Journal) +journalFromCsv csvfile content = do + let usingStdin = csvfile == "-" + -- rulesFileSpecified = isJust $ rules_file_ opts + rulesfile = rulesFileFor csvfile + -- when (usingStdin && (not rulesFileSpecified)) $ error' "please use --rules-file to specify a rules file when converting stdin" + csvparse <- parseCsv csvfile content + let records = case csvparse of + Left e -> error' $ show e + Right rs -> filter (/= [""]) rs + exists <- doesFileExist rulesfile + if (not exists) then do + hPrintf stderr "creating conversion rules file %s, edit this file for better results\n" rulesfile + writeFile rulesfile initialRulesFileContent + else + hPrintf stderr "using conversion rules file %s\n" rulesfile + rules <- liftM (either (error'.show) id) $ parseCsvRulesFile rulesfile + let invalid = validateRules rules + -- when (debug_ opts) $ hPrintf stderr "rules: %s\n" (show rules) + when (isJust invalid) $ error (fromJust invalid) + let requiredfields = max 2 (maxFieldIndex rules + 1) + badrecords = take 1 $ filter ((< requiredfields).length) records + if null badrecords + then do + return $ Right nulljournal{jtxns=sortBy (comparing tdate) $ map (transactionFromCsvRecord rules) records} + else + return $ Left (unlines [ + "Warning, at least one CSV record does not contain a field referenced by the" + ,"conversion rules file, or has less than two fields. Are you converting a" + ,"valid CSV file ? First bad record:" + , show $ head badrecords + ]) + +parseCsv :: FilePath -> String -> IO (Either ParseError CSV) +parseCsv path content = + case path of + "-" -> liftM (parseCSV "(stdin)") getContents + _ -> return $ parseCSV path content + +-- | The highest (0-based) field index referenced in the field +-- definitions, or -1 if no fields are defined. +maxFieldIndex :: CsvRules -> Int +maxFieldIndex r = maximumDef (-1) $ catMaybes [ + dateField r + ,statusField r + ,codeField r + ,amountField r + ,amountInField r + ,amountOutField r + ,currencyField r + ,accountField r + ,account2Field r + ,effectiveDateField r + ] + +-- rulesFileFor :: CliOpts -> FilePath -> FilePath +-- rulesFileFor CliOpts{rules_file_=Just f} _ = f +-- rulesFileFor CliOpts{rules_file_=Nothing} csvfile = replaceExtension csvfile ".rules" +rulesFileFor :: FilePath -> FilePath +rulesFileFor = flip replaceExtension ".rules" + +initialRulesFileContent :: String +initialRulesFileContent = let prognameandversion = "hledger" in + "# csv conversion rules file generated by " ++ prognameandversion ++ "\n" ++ + "# Add rules to this file for more accurate conversion, see\n"++ + "# http://hledger.org/MANUAL.html#convert\n" ++ + "\n" ++ + "base-account assets:bank:checking\n" ++ + "date-field 0\n" ++ + "description-field 4\n" ++ + "amount-field 1\n" ++ + "base-currency $\n" ++ + "\n" ++ + "# account-assigning rules\n" ++ + "\n" ++ + "SPECTRUM\n" ++ + "expenses:health:gym\n" ++ + "\n" ++ + "ITUNES\n" ++ + "BLKBSTR=BLOCKBUSTER\n" ++ + "expenses:entertainment\n" ++ + "\n" ++ + "(TO|FROM) SAVINGS\n" ++ + "assets:bank:savings\n" + +validateRules :: CsvRules -> Maybe String +validateRules rules = let + hasAmount = isJust $ amountField rules + hasIn = isJust $ amountInField rules + hasOut = isJust $ amountOutField rules + in case (hasAmount, hasIn, hasOut) of + (True, True, _) -> Just "Don't specify amount-in-field when specifying amount-field" + (True, _, True) -> Just "Don't specify amount-out-field when specifying amount-field" + (_, False, True) -> Just "Please specify amount-in-field when specifying amount-out-field" + (_, True, False) -> Just "Please specify amount-out-field when specifying amount-in-field" + (False, False, False) -> Just "Please specify either amount-field, or amount-in-field and amount-out-field" + _ -> Nothing + +-- rules file parser + +parseCsvRulesFile :: FilePath -> IO (Either ParseError CsvRules) +parseCsvRulesFile f = do + s <- readFile f + return $ parseCsvRules f s + +parseCsvRules :: FilePath -> String -> Either ParseError CsvRules +parseCsvRules rulesfile s = runParser csvrulesfile nullrules{baseAccount=takeBaseName rulesfile} rulesfile s + +csvrulesfile :: GenParser Char CsvRules CsvRules +csvrulesfile = do + many blankorcommentline + many definitions + r <- getState + ars <- many accountrule + many blankorcommentline + eof + return r{accountRules=ars} + +definitions :: GenParser Char CsvRules () +definitions = do + choice' [ + datefield + ,dateformat + ,statusfield + ,codefield + ,descriptionfield + ,amountfield + ,amountinfield + ,amountoutfield + ,currencyfield + ,accountfield + ,account2field + ,effectivedatefield + ,basecurrency + ,baseaccount + ,commentline + ] "definition" + return () + +datefield = do + string "date-field" + many1 spacenonewline + v <- restofline + updateState (\r -> r{dateField=readMay v}) + +effectivedatefield = do + string "effective-date-field" + many1 spacenonewline + v <- restofline + updateState (\r -> r{effectiveDateField=readMay v}) + +dateformat = do + string "date-format" + many1 spacenonewline + v <- restofline + updateState (\r -> r{dateFormat=Just v}) + +codefield = do + string "code-field" + many1 spacenonewline + v <- restofline + updateState (\r -> r{codeField=readMay v}) + +statusfield = do + string "status-field" + many1 spacenonewline + v <- restofline + updateState (\r -> r{statusField=readMay v}) + +descriptionFieldValue :: GenParser Char st [FormatString] +descriptionFieldValue = do +-- try (fieldNo <* spacenonewline) + try fieldNo + <|> formatStrings + where + fieldNo = many1 digit >>= \x -> return [FormatField False Nothing Nothing $ FieldNo $ read x] + +descriptionfield = do + string "description-field" + many1 spacenonewline + formatS <- descriptionFieldValue + restofline + updateState (\x -> x{descriptionField=formatS}) + +amountfield = do + string "amount-field" + many1 spacenonewline + v <- restofline + x <- updateState (\r -> r{amountField=readMay v}) + return x + +amountinfield = do + choice [string "amount-in-field", string "in-field"] + many1 spacenonewline + v <- restofline + updateState (\r -> r{amountInField=readMay v}) + +amountoutfield = do + choice [string "amount-out-field", string "out-field"] + many1 spacenonewline + v <- restofline + updateState (\r -> r{amountOutField=readMay v}) + +currencyfield = do + string "currency-field" + many1 spacenonewline + v <- restofline + updateState (\r -> r{currencyField=readMay v}) + +accountfield = do + string "account-field" + many1 spacenonewline + v <- restofline + updateState (\r -> r{accountField=readMay v}) + +account2field = do + string "account2-field" + many1 spacenonewline + v <- restofline + updateState (\r -> r{account2Field=readMay v}) + +basecurrency = do + choice [string "base-currency", string "currency"] + many1 spacenonewline + v <- restofline + updateState (\r -> r{baseCurrency=Just v}) + +baseaccount = do + string "base-account" + many1 spacenonewline + v <- ledgeraccountname + optional newline + updateState (\r -> r{baseAccount=v}) + +accountrule :: GenParser Char CsvRules AccountRule +accountrule = do + many blankorcommentline + pats <- many1 matchreplacepattern + guard $ length pats >= 2 + let pats' = init pats + acct = either (fail.show) id $ runParser ledgeraccountname () "" $ fst $ last pats + many blankorcommentline + return (pats',acct) + "account rule" + +blanklines = many1 blankline + +blankline = many spacenonewline >> newline >> return () "blank line" + +commentchar = oneOf ";#" + +commentline = many spacenonewline >> commentchar >> restofline >> return () "comment line" + +blankorcommentline = choice' [blankline, commentline] + +matchreplacepattern = do + notFollowedBy commentchar + matchpat <- many1 (noneOf "=\n") + replpat <- optionMaybe $ do {char '='; many $ noneOf "\n"} + newline + return (matchpat,replpat) + +-- csv record conversion +formatD :: CsvRecord -> Bool -> Maybe Int -> Maybe Int -> Field -> String +formatD record leftJustified min max f = case f of + FieldNo n -> maybe "" show $ atMay record n + -- Some of these might in theory in read from fields + FormatStrings.Account -> "" + DepthSpacer -> "" + Total -> "" + DefaultDate -> "" + Description -> "" + where + show = formatValue leftJustified min max + +formatDescription :: CsvRecord -> [FormatString] -> String +formatDescription _ [] = "" +formatDescription record (f:fs) = s ++ (formatDescription record fs) + where s = case f of + FormatLiteral l -> l + FormatField leftJustified min max field -> formatD record leftJustified min max field + +transactionFromCsvRecord :: CsvRules -> CsvRecord -> Transaction +transactionFromCsvRecord rules fields = + let + date = parsedate $ normaliseDate (dateFormat rules) $ maybe "1900/1/1" (atDef "" fields) (dateField rules) + effectivedate = do idx <- effectiveDateField rules + return $ parsedate $ normaliseDate (dateFormat rules) $ (atDef "" fields) idx + status = maybe False (null . strip . (atDef "" fields)) (statusField rules) + code = maybe "" (atDef "" fields) (codeField rules) + desc = formatDescription fields (descriptionField rules) + comment = "" + precomment = "" + baseacc = maybe (baseAccount rules) (atDef "" fields) (accountField rules) + amountstr = getAmount rules fields + amountstr' = strnegate amountstr where strnegate ('-':s) = s + strnegate s = '-':s + currency = maybe (fromMaybe "" $ baseCurrency rules) (atDef "" fields) (currencyField rules) + amountstr'' = currency ++ amountstr' + amountparse = runParser someamount nullctx "" amountstr'' + amount = either (const nullmixedamt) id amountparse + -- Using costOfMixedAmount here to allow complex costs like "10 GBP @@ 15 USD". + -- Aim is to have "10 GBP @@ 15 USD" applied to account "acct", but have "-15USD" applied to "baseacct" + baseamount = costOfMixedAmount amount + unknownacct | (readDef 0 amountstr' :: Double) < 0 = "income:unknown" + | otherwise = "expenses:unknown" + (acct',newdesc) = identify (accountRules rules) unknownacct desc + acct = maybe acct' (atDef "" fields) (account2Field rules) + t = Transaction { + tdate=date, + teffectivedate=effectivedate, + tstatus=status, + tcode=code, + tdescription=newdesc, + tcomment=comment, + tpreceding_comment_lines=precomment, + tmetadata=[], + tpostings=[ + Posting { + pstatus=False, + paccount=acct, + pamount=amount, + pcomment="", + ptype=RegularPosting, + pmetadata=[], + ptransaction=Just t + }, + Posting { + pstatus=False, + paccount=baseacc, + pamount=(-baseamount), + pcomment="", + ptype=RegularPosting, + pmetadata=[], + ptransaction=Just t + } + ] + } + in t + +-- | Convert some date string with unknown format to YYYY/MM/DD. +normaliseDate :: Maybe String -- ^ User-supplied date format: this should be tried in preference to all others + -> String -> String +normaliseDate mb_user_format s = + let parsewith = flip (parseTime defaultTimeLocale) s in + maybe (error' $ "could not parse \""++s++"\" as a date, consider adding a date-format directive or upgrading") + showDate $ + firstJust $ (map parsewith $ + maybe [] (:[]) mb_user_format + -- the - modifier requires time-1.2.0.5, released + -- in 2011/5, so for now we emulate it for wider + -- compatibility. time < 1.2.0.5 also has a buggy + -- %y which we don't do anything about. + -- ++ [ + -- "%Y/%m/%d" + -- ,"%Y/%-m/%-d" + -- ,"%Y-%m-%d" + -- ,"%Y-%-m-%-d" + -- ,"%m/%d/%Y" + -- ,"%-m/%-d/%Y" + -- ,"%m-%d-%Y" + -- ,"%-m-%-d-%Y" + -- ] + ) + ++ [ + parseTime defaultTimeLocale "%Y/%m/%e" s + ,parseTime defaultTimeLocale "%Y-%m-%e" s + ,parseTime defaultTimeLocale "%m/%e/%Y" s + ,parseTime defaultTimeLocale "%m-%e-%Y" s + ,parseTime defaultTimeLocale "%Y/%m/%e" (take 5 s ++ "0" ++ drop 5 s) + ,parseTime defaultTimeLocale "%Y-%m-%e" (take 5 s ++ "0" ++ drop 5 s) + ,parseTime defaultTimeLocale "%m/%e/%Y" ('0':s) + ,parseTime defaultTimeLocale "%m-%e-%Y" ('0':s) + ] + +-- | Apply account matching rules to a transaction description to obtain +-- the most appropriate account and a new description. +identify :: [AccountRule] -> String -> String -> (String,String) +identify rules defacct desc | null matchingrules = (defacct,desc) + | otherwise = (acct,newdesc) + where + matchingrules = filter ismatch rules :: [AccountRule] + where ismatch = any ((`regexMatchesCI` desc) . fst) . fst + (prs,acct) = head matchingrules + p_ms_r = filter (\(_,m,_) -> m) $ map (\(p,r) -> (p, p `regexMatchesCI` desc, r)) prs + (p,_,r) = head p_ms_r + newdesc = case r of Just repl -> regexReplaceCI p repl desc + Nothing -> desc + +caseinsensitive = ("(?i)"++) + +getAmount :: CsvRules -> CsvRecord -> String +getAmount rules fields = case amountField rules of + Just f -> maybe "" (atDef "" fields) $ Just f + Nothing -> + case (i, o) of + (x, "") -> x + ("", x) -> "-"++x + p -> error' $ "using amount-in-field and amount-out-field, found a value in both fields: "++show p + where + i = maybe "" (atDef "" fields) (amountInField rules) + o = maybe "" (atDef "" fields) (amountOutField rules) + +tests_Hledger_Read_CsvReader = TestList (test_parser ++ test_description_parsing) + +test_description_parsing = [ + "description-field 1" ~: assertParseDescription "description-field 1\n" [FormatField False Nothing Nothing (FieldNo 1)] + , "description-field 1 " ~: assertParseDescription "description-field 1 \n" [FormatField False Nothing Nothing (FieldNo 1)] + , "description-field %(1)" ~: assertParseDescription "description-field %(1)\n" [FormatField False Nothing Nothing (FieldNo 1)] + , "description-field %(1)/$(2)" ~: assertParseDescription "description-field %(1)/%(2)\n" [ + FormatField False Nothing Nothing (FieldNo 1) + , FormatLiteral "/" + , FormatField False Nothing Nothing (FieldNo 2) + ] + ] + where + assertParseDescription string expected = do assertParseEqual (parseDescription string) (nullrules {descriptionField = expected}) + parseDescription :: String -> Either ParseError CsvRules + parseDescription x = runParser descriptionfieldWrapper nullrules "(unknown)" x + descriptionfieldWrapper :: GenParser Char CsvRules CsvRules + descriptionfieldWrapper = do + descriptionfield + r <- getState + return r + +test_parser = [ + + "convert rules parsing: empty file" ~: do + -- let assertMixedAmountParse parseresult mixedamount = + -- (either (const "parse error") showMixedAmountDebug parseresult) ~?= (showMixedAmountDebug mixedamount) + assertParseEqual (parseCsvRules "unknown" "") nullrules + + ,"convert rules parsing: accountrule" ~: do + assertParseEqual (parseWithCtx nullrules accountrule "A\na\n") -- leading blank line required + ([("A",Nothing)], "a") + + ,"convert rules parsing: trailing comments" ~: do + assertParse (parseWithCtx nullrules csvrulesfile "A\na\n# \n#\n") + + ,"convert rules parsing: trailing blank lines" ~: do + assertParse (parseWithCtx nullrules csvrulesfile "A\na\n\n \n") + + -- not supported + -- ,"convert rules parsing: no final newline" ~: do + -- assertParse (parseWithCtx nullrules csvrulesfile "A\na") + -- assertParse (parseWithCtx nullrules csvrulesfile "A\na\n# \n#") + -- assertParse (parseWithCtx nullrules csvrulesfile "A\na\n\n ") + + -- (nullrules{ + -- -- dateField=Maybe FieldPosition, + -- -- statusField=Maybe FieldPosition, + -- -- codeField=Maybe FieldPosition, + -- -- descriptionField=Maybe FieldPosition, + -- -- amountField=Maybe FieldPosition, + -- -- currencyField=Maybe FieldPosition, + -- -- baseCurrency=Maybe String, + -- -- baseAccount=AccountName, + -- accountRules=[ + -- ([("A",Nothing)], "a") + -- ] + -- }) + + ]