From cf3d21afef5a6a8886f05853f30c9360d2882dd3 Mon Sep 17 00:00:00 2001 From: Simon Michael Date: Fri, 9 May 2014 17:55:32 -0700 Subject: [PATCH] csv and general reader fixes, cleanups - The CSV reader no longer writes a "(stdin).rules" file when reading from stdin. - Selection of reader(s) is now smarter when input is coming from stdin. Previously, all readers were considered applicable for stdin. This meant that when reading a CSV file from stdin, the journal and timelog readers were always tried first, and if the CSV file was unparseable, you'd see the first (journal) reader's error instead of the CSV reader's. Now, the readers do some basic content sniffing when reading stdin, so it generally tries only the one right reader and we'll see the right errors. - The read system now has more debug output. --- hledger-lib/Hledger/Data/Types.hs | 2 +- hledger-lib/Hledger/Read.hs | 8 +++--- hledger-lib/Hledger/Read/CsvReader.hs | 32 ++++++++++++++--------- hledger-lib/Hledger/Read/JournalReader.hs | 10 ++++--- hledger-lib/Hledger/Read/TimelogReader.hs | 10 ++++--- 5 files changed, 36 insertions(+), 26 deletions(-) diff --git a/hledger-lib/Hledger/Data/Types.hs b/hledger-lib/Hledger/Data/Types.hs index 68faf4ad8..21902f19f 100644 --- a/hledger-lib/Hledger/Data/Types.hs +++ b/hledger-lib/Hledger/Data/Types.hs @@ -182,7 +182,7 @@ data Reader = Reader { ,rParser :: Maybe FilePath -> FilePath -> String -> ErrorT String IO Journal } -instance Show Reader where show r = "Reader for "++rFormat r +instance Show Reader where show r = rFormat r ++ " reader" -- format strings diff --git a/hledger-lib/Hledger/Read.hs b/hledger-lib/Hledger/Read.hs index bc1a9a5cf..37334fdb9 100644 --- a/hledger-lib/Hledger/Read.hs +++ b/hledger-lib/Hledger/Read.hs @@ -116,7 +116,6 @@ tests_readJournal' = [ -- A CSV conversion rules file may also be specified for use by the CSV reader. readJournal :: Maybe StorageFormat -> Maybe FilePath -> Maybe FilePath -> String -> IO (Either String Journal) readJournal format rulesfile path s = - -- trace (show (format, rulesfile, path)) $ tryReaders $ readersFor (format, path, s) where -- try each reader in turn, returning the error of the first if all fail @@ -126,8 +125,9 @@ readJournal format rulesfile path s = firstSuccessOrBestError :: [String] -> [Reader] -> IO (Either String Journal) firstSuccessOrBestError [] [] = return $ Left "no readers found" firstSuccessOrBestError errs (r:rs) = do - -- printf "trying %s reader\n" (rFormat r) + dbgAtM 1 "trying reader" (rFormat r) result <- (runErrorT . (rParser r) rulesfile path') s + dbgAtM 1 "reader result" $ either id show result case result of Right j -> return $ Right j -- success! Left e -> firstSuccessOrBestError (errs++[e]) rs -- keep trying firstSuccessOrBestError (e:_) [] = return $ Left e -- none left, return first error @@ -136,11 +136,11 @@ readJournal format rulesfile path s = -- | Which readers are worth trying for this (possibly unspecified) format, filepath, and data ? readersFor :: (Maybe StorageFormat, Maybe FilePath, String) -> [Reader] readersFor (format,path,s) = + dbg ("possible readers for "++show (format,path,elideRight 30 s)) $ case format of Just f -> case readerForStorageFormat f of Just r -> [r] Nothing -> [] Nothing -> case path of Nothing -> readers - Just "-" -> readers Just p -> case readersForPathAndData (p,s) of [] -> readers rs -> rs @@ -162,7 +162,7 @@ readersForPathAndData (f,s) = filter (\r -> (rDetector r) f s) readers readJournalFile :: Maybe StorageFormat -> Maybe FilePath -> FilePath -> IO (Either String Journal) readJournalFile format rulesfile "-" = do hSetNewlineMode stdin universalNewlineMode - getContents >>= readJournal format rulesfile (Just "(stdin)") + getContents >>= readJournal format rulesfile (Just "-") readJournalFile format rulesfile f = do requireJournalFileExists f withFile f ReadMode $ \h -> do diff --git a/hledger-lib/Hledger/Read/CsvReader.hs b/hledger-lib/Hledger/Read/CsvReader.hs index f3cb1a262..687eff7a2 100644 --- a/hledger-lib/Hledger/Read/CsvReader.hs +++ b/hledger-lib/Hledger/Read/CsvReader.hs @@ -53,15 +53,16 @@ reader = Reader format detect parse format :: String format = "csv" --- | Does the given file path and data look like CSV ? +-- | Does the given file path and data look like it might be CSV ? detect :: FilePath -> String -> Bool -detect f _ = takeExtension f == '.':format +detect f s + | f /= "-" = takeExtension f == '.':format -- from a file: yes if the extension is .csv + | otherwise = length (filter (==',') s) >= 2 -- from stdin: yes if there are two or more commas -- | Parse and post-process a "Journal" from CSV data, or give an error. -- XXX currently ignores the string and reads from the file path parse :: Maybe FilePath -> FilePath -> String -> ErrorT String IO Journal -parse rulesfile f s = -- trace ("running "++format++" reader") $ - do +parse rulesfile f s = do r <- liftIO $ readJournalFromCsv rulesfile f s case r of Left e -> throwError e Right j -> return j @@ -78,7 +79,7 @@ parse rulesfile f s = -- trace ("running "++format++" reader") $ -- 5. convert the CSV records to a journal using the rules -- @ readJournalFromCsv :: Maybe FilePath -> FilePath -> String -> IO (Either String Journal) -readJournalFromCsv Nothing "-" _ = return $ Left "please use --rules-file when converting stdin" +readJournalFromCsv Nothing "-" _ = return $ Left "please use --rules-file when reading CSV from stdin" readJournalFromCsv mrulesfile csvfile csvdata = handle (\e -> return $ Left $ show (e :: IOException)) $ do let throwerr = throw.userError @@ -90,7 +91,7 @@ readJournalFromCsv mrulesfile csvfile csvdata = then hPrintf stderr "creating default conversion rules file %s, edit this file for better results\n" rulesfile else hPrintf stderr "using conversion rules file %s\n" rulesfile rules <- either (throwerr.show) id `fmap` parseRulesFile rulesfile - return $ dbg "" rules + dbgAtM 2 "rules" rules -- apply skip directive let skip = maybe 0 oneorerror $ getDirective "skip" rules @@ -99,8 +100,13 @@ readJournalFromCsv mrulesfile csvfile csvdata = oneorerror s = readDef (throwerr $ "could not parse skip value: " ++ show s) s -- parse csv - records <- (either throwerr id . validateCsv skip) `fmap` parseCsv csvfile csvdata - dbgAtM 1 "" $ take 3 records + -- parsec seems to fail if you pass it "-" here + let parsecfilename = if csvfile == "-" then "(stdin)" else csvfile + records <- (either throwerr id . + dbgAt 2 "validateCsv" . validateCsv skip . + dbgAt 2 "parseCsv") + `fmap` parseCsv parsecfilename csvdata + dbgAtM 1 "first 3 csv records" $ take 3 records -- identify header lines -- let (headerlines, datalines) = identifyHeaderLines records @@ -378,13 +384,13 @@ rulesp = do ,rconditionalblocks=reverse $ rconditionalblocks r } -blankorcommentline = pdbg 1 "trying blankorcommentline" >> choice' [blankline, commentline] +blankorcommentline = pdbg 3 "trying blankorcommentline" >> choice' [blankline, commentline] blankline = many spacenonewline >> newline >> return () "blank line" commentline = many spacenonewline >> commentchar >> restofline >> return () "comment line" commentchar = oneOf ";#" directive = do - pdbg 1 "trying directive" + pdbg 3 "trying directive" d <- choice' $ map string directives v <- (((char ':' >> many spacenonewline) <|> many1 spacenonewline) >> directiveval) <|> (optional (char ':') >> many spacenonewline >> eolof >> return "") @@ -404,7 +410,7 @@ directives = directiveval = anyChar `manyTill` eolof fieldnamelist = (do - pdbg 1 "trying fieldnamelist" + pdbg 3 "trying fieldnamelist" string "fields" optional $ char ':' many1 spacenonewline @@ -426,7 +432,7 @@ quotedfieldname = do barefieldname = many1 $ noneOf " \t\n,;#~" fieldassignment = do - pdbg 1 "trying fieldassignment" + pdbg 3 "trying fieldassignment" f <- journalfieldname assignmentseparator v <- fieldval @@ -466,7 +472,7 @@ fieldval = do anyChar `manyTill` eolof conditionalblock = do - pdbg 1 "trying conditionalblock" + pdbg 3 "trying conditionalblock" string "if" >> many spacenonewline >> optional newline ms <- many1 recordmatcher as <- many (many1 spacenonewline >> fieldassignment) diff --git a/hledger-lib/Hledger/Read/JournalReader.hs b/hledger-lib/Hledger/Read/JournalReader.hs index f5a19fcb9..4772498c1 100644 --- a/hledger-lib/Hledger/Read/JournalReader.hs +++ b/hledger-lib/Hledger/Read/JournalReader.hs @@ -76,15 +76,17 @@ reader = Reader format detect parse format :: String format = "journal" --- | Does the given file path and data provide hledger's journal file format ? +-- | Does the given file path and data look like it might be hledger's journal format ? detect :: FilePath -> String -> Bool -detect f _ = takeExtension f `elem` ['.':format, ".j"] +detect f s + | f /= "-" = takeExtension f `elem` ['.':format, ".j"] -- from a file: yes if the extension is .journal or .j + -- from stdin: yes if we can see something that looks like a journal entry (digits in column 0 with the next line indented) + | otherwise = isJust $ regexMatch "^[0-9]+.*\n[ \t]+" s -- | Parse and post-process a "Journal" from hledger's journal file -- format, or give an error. parse :: Maybe FilePath -> FilePath -> String -> ErrorT String IO Journal -parse _ = -- trace ("running "++format++" reader") . - parseJournalWith journal +parse _ = parseJournalWith journal -- parsing utils diff --git a/hledger-lib/Hledger/Read/TimelogReader.hs b/hledger-lib/Hledger/Read/TimelogReader.hs index c9d900b20..bc8e78a9e 100644 --- a/hledger-lib/Hledger/Read/TimelogReader.hs +++ b/hledger-lib/Hledger/Read/TimelogReader.hs @@ -49,6 +49,7 @@ module Hledger.Read.TimelogReader ( where import Control.Monad import Control.Monad.Error +import Data.List (isPrefixOf) import Test.HUnit import Text.ParserCombinators.Parsec hiding (parse) import System.FilePath @@ -68,16 +69,17 @@ reader = Reader format detect parse format :: String format = "timelog" --- | Does the given file path and data provide timeclock.el's timelog format ? +-- | Does the given file path and data look like it might be timeclock.el's timelog format ? detect :: FilePath -> String -> Bool -detect f _ = takeExtension f == '.':format +detect f s + | f /= "-" = takeExtension f == '.':format -- from a file: yes if the extension is .timelog + | otherwise = "i " `isPrefixOf` s || "o " `isPrefixOf` s -- from stdin: yes if it starts with "i " or "o " -- | Parse and post-process a "Journal" from timeclock.el's timelog -- format, saving the provided file path and the current time, or give an -- error. parse :: Maybe FilePath -> FilePath -> String -> ErrorT String IO Journal -parse _ = -- trace ("running "++format++" reader") . - parseJournalWith timelogFile +parse _ = parseJournalWith timelogFile timelogFile :: GenParser Char JournalContext (JournalUpdate,JournalContext) timelogFile = do items <- many timelogItem