lib: rewrite comment parsers [API]

- avoid the "re-parsing" of comments using the `match` parser combinator
This commit is contained in:
Alex Chen 2018-06-05 23:44:02 -06:00
parent d707b351cc
commit 9b6558401f
3 changed files with 241 additions and 208 deletions

View File

@ -183,6 +183,7 @@ instance NFData PostingType
type TagName = Text
type TagValue = Text
type Tag = (TagName, TagValue) -- ^ A tag name and (possibly empty) value.
type DateTag = (TagName, Day)
-- | The status of a transaction or posting, recorded with a status mark
-- (nothing, !, or *). What these mean is ultimately user defined.

View File

@ -14,6 +14,7 @@ Some of these might belong in Hledger.Read.JournalReader or Hledger.Read.
--- * module
{-# LANGUAGE CPP, BangPatterns, DeriveDataTypeable, RecordWildCards, NamedFieldPuns, NoMonoLocalBinds, ScopedTypeVariables, FlexibleContexts, TupleSections, OverloadedStrings #-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE PackageImports #-}
@ -81,12 +82,10 @@ module Hledger.Read.Common (
-- ** comments
multilinecommentp,
emptyorcommentlinep,
followingcommentp,
followingcommentandtagsp,
-- ** tags
commentTags,
tagsp,
followingcommentp,
transactioncommentp,
postingcommentp,
-- ** bracketed dates
bracketeddatetagsp
@ -98,6 +97,7 @@ import "base-compat-batteries" Prelude.Compat hiding (readFile)
import "base-compat-batteries" Control.Monad.Compat
import Control.Monad.Except (ExceptT(..), runExceptT, throwError) --, catchError)
import Control.Monad.State.Strict
import Data.Bifunctor (bimap, second)
import Data.Char
import Data.Data
import Data.Decimal (DecimalRaw (Decimal), Decimal)
@ -384,6 +384,8 @@ datep' mYear = do
"partial date "++dateStr++" found, but the current year is unknown"
where dateStr = show month ++ [sep] ++ show day
{-# INLINABLE datep' #-}
-- | Parse a date and time in YYYY/MM/DD HH:MM[:SS][+-ZZZZ] format.
-- Hyphen (-) and period (.) are also allowed as date separators.
-- The year may be omitted if a default year has been set.
@ -871,211 +873,235 @@ data AmbiguousNumber = AmbiguousNumber DigitGrp Char DigitGrp -- 1,000
multilinecommentp :: TextParser m ()
multilinecommentp = startComment *> anyLine `skipManyTill` endComment
where
startComment = string "comment" >> skipLine
endComment = eof <|> string "end comment" *> skipLine
startComment = string "comment" *> trailingSpaces
endComment = eof <|> string "end comment" *> trailingSpaces
skipLine = void $ skipMany spacenonewline *> newline
anyLine = takeWhileP Nothing (\c -> c /= '\n') *> newline
trailingSpaces = skipMany spacenonewline <* newline
anyLine = void $ takeWhileP Nothing (\c -> c /= '\n') *> newline
{-# INLINABLE multilinecommentp #-}
emptyorcommentlinep :: TextParser m ()
emptyorcommentlinep = do
skipMany spacenonewline
void linecommentp <|> void newline
skiplinecommentp <|> void newline
where
-- A line (file-level) comment can start with a semicolon, hash, or star
-- (allowing org nodes).
skiplinecommentp :: TextParser m ()
skiplinecommentp = do
satisfy $ \c -> c == ';' || c == '#' || c == '*'
void $ takeWhileP Nothing (\c -> c /= '\n')
optional newline
pure ()
-- | Parse a possibly multi-line comment following a semicolon.
followingcommentp :: TextParser m Text
followingcommentp = T.unlines . map snd <$> followingcommentlinesp
{-# INLINABLE emptyorcommentlinep #-}
followingcommentlinesp :: TextParser m [(SourcePos, Text)]
followingcommentlinesp = do
skipMany spacenonewline
samelineComment@(_, samelineCommentText)
<- try commentp <|> (,) <$> (getPosition <* eolof) <*> pure ""
newlineComments <- many $ try $ do
skipSome spacenonewline -- leading whitespace is required
commentp
if T.null samelineCommentText && null newlineComments
then pure []
else pure $ samelineComment : newlineComments
-- | Parse a possibly multi-line comment following a semicolon, and
-- any tags and/or posting dates within it. Posting dates can be
-- expressed with "date"/"date2" tags and/or bracketed dates. The
-- dates are parsed in full here so that errors are reported in the
-- right position. Missing years can be inferred if a default date is
-- provided.
-- A parser combinator for parsing (possibly multiline) comments
-- following journal items.
--
-- >>> rejp (followingcommentandtagsp (Just $ fromGregorian 2000 1 2)) "; a:b, date:3/4, [=5/6]"
-- Several journal items may be followed by comments, which begin with
-- semicolons and extend to the end of the line. Such comments may span
-- multiple lines, but comment lines below the journal item must be
-- preceeded by leading whitespace.
--
-- This parser combinator accepts a parser that consumes all input up
-- until the next newline. This parser should extract the "content" from
-- comments. The resulting parser returns this content plus the raw text
-- of the comment itself.
followingcommentp' :: (Monoid a) => TextParser m a -> TextParser m (Text, a)
followingcommentp' contentp = do
skipMany spacenonewline
sameLine <- try headerp *> match' contentp <|> pure ("", mempty)
_ <- eolof
lowerLines <- many $
try (skipSome spacenonewline *> headerp) *> match' contentp <* eolof
let (textLines, results) = unzip $ sameLine : lowerLines
strippedCommentText = T.unlines $ map T.strip textLines
result = mconcat results
pure (strippedCommentText, result)
where
headerp = char ';' *> skipMany spacenonewline
{-# INLINABLE followingcommentp' #-}
-- | Parse the text of a (possibly multiline) comment following a journal
-- item.
followingcommentp :: TextParser m Text
followingcommentp =
fst <$> followingcommentp' (void $ takeWhileP Nothing (/= '\n'))
{-# INLINABLE followingcommentp #-}
-- | Parse a transaction comment and extract its tags.
--
-- The first line of a transaction may be followed by comments, which
-- begin with semicolons and extend to the end of the line. Transaction
-- comments may span multiple lines, but comment lines below the
-- transaction must be preceeded by leading whitespace.
--
-- 2000/1/1 ; a transaction comment starting on the same line ...
-- ; extending to the next line
-- account1 $1
-- account2
--
-- Tags are name-value pairs.
--
-- >>> let getTags (_,tags) = tags
-- >>> let parseTags = fmap getTags . rtp transactioncommentp
--
-- >>> parseTags "; name1: val1, name2:all this is value2"
-- Right [("name1","val1"),("name2","all this is value2")]
--
-- A tag's name must be immediately followed by a colon, without
-- separating whitespace. The corresponding value consists of all the text
-- following the colon up until the next colon or newline, stripped of
-- leading and trailing whitespace.
--
transactioncommentp :: TextParser m (Text, [Tag])
transactioncommentp = followingcommentp' commenttagsp
{-# INLINABLE transactioncommentp #-}
commenttagsp :: TextParser m [Tag]
commenttagsp = do
tagName <- fmap (last . T.split isSpace)
$ takeWhileP Nothing (\c -> c /= ':' && c /= '\n')
atColon tagName <|> pure [] -- if not ':', then either '\n' or EOF
where
atColon :: Text -> TextParser m [Tag]
atColon name = char ':' *> do
if T.null name
then commenttagsp
else do
skipMany spacenonewline
val <- tagValue
let tag = (name, val)
(tag:) <$> commenttagsp
tagValue :: TextParser m Text
tagValue = do
val <- T.strip <$> takeWhileP Nothing (\c -> c /= ',' && c /= '\n')
_ <- optional $ char ','
pure val
{-# INLINABLE commenttagsp #-}
-- | Parse a posting comment and extract its tags and dates.
--
-- Postings may be followed by comments, which begin with semicolons and
-- extend to the end of the line. Posting comments may span multiple
-- lines, but comment lines below the posting must be preceeded by
-- leading whitespace.
--
-- 2000/1/1
-- account1 $1 ; a posting comment starting on the same line ...
-- ; extending to the next line
--
-- account2
-- ; a posting comment beginning on the next line
--
-- Tags are name-value pairs.
--
-- >>> let getTags (_,tags,_,_) = tags
-- >>> let parseTags = fmap getTags . rtp (postingcommentp Nothing)
--
-- >>> parseTags "; name1: val1, name2:all this is value2"
-- Right [("name1","val1"),("name2","all this is value2")]
--
-- A tag's name must be immediately followed by a colon, without
-- separating whitespace. The corresponding value consists of all the text
-- following the colon up until the next colon or newline, stripped of
-- leading and trailing whitespace.
--
-- Posting dates may be expressed with "date"/"date2" tags or with
-- bracketed date syntax. Posting dates will inherit their year from the
-- transaction date if the year is not specified. We throw parse errors on
-- invalid dates.
--
-- >>> let getDates (_,_,d1,d2) = (d1, d2)
-- >>> let parseDates = fmap getDates . rtp (postingcommentp (Just 2000))
--
-- >>> parseDates "; date: 1/2, date2: 1999/12/31"
-- Right (Just 2000-01-02,Just 1999-12-31)
-- >>> parseDates "; [1/2=1999/12/31]"
-- Right (Just 2000-01-02,Just 1999-12-31)
--
-- Example: tags, date tags, and bracketed dates
-- >>> rtp (postingcommentp (Just 2000)) "; a:b, date:3/4, [=5/6]"
-- Right ("a:b, date:3/4, [=5/6]\n",[("a","b"),("date","3/4")],Just 2000-03-04,Just 2000-05-06)
--
-- Year unspecified and no default provided -> unknown year error, at correct position:
-- >>> rejp (followingcommentandtagsp Nothing) " ; xxx date:3/4\n ; second line"
-- Left ...1:22...partial date 3/4 found, but the current year is unknown...
--
-- Date tag value contains trailing text - forgot the comma, confused:
-- the syntaxes ? We'll accept the leading date anyway
-- >>> rejp (followingcommentandtagsp (Just $ fromGregorian 2000 1 2)) "; date:3/4=5/6"
-- Example: extraction of dates from date tags ignores trailing text
-- >>> rtp (postingcommentp (Just 2000)) "; date:3/4=5/6"
-- Right ("date:3/4=5/6\n",[("date","3/4=5/6")],Just 2000-03-04,Nothing)
--
followingcommentandtagsp
:: Monad m
=> Maybe Day
-> ErroringJournalParser m (Text, [Tag], Maybe Day, Maybe Day)
followingcommentandtagsp mdefdate = do
-- pdbg 0 "followingcommentandtagsp"
postingcommentp
:: Maybe Year -> TextParser m (Text, [Tag], Maybe Day, Maybe Day)
postingcommentp mYear = do
(commentText, (tags, dateTags)) <-
followingcommentp' (commenttagsanddatesp mYear)
let mdate = fmap snd $ find ((=="date") .fst) dateTags
mdate2 = fmap snd $ find ((=="date2").fst) dateTags
pure (commentText, tags, mdate, mdate2)
{-# INLINABLE postingcommentp #-}
commentLines <- lift followingcommentlinesp
-- pdbg 0 $ "commentws:" ++ show commentLines
-- Reparse the comment for any tags.
tagsWithPositions <- case
traverse (runTextParserAt tagswithvaluepositions) commentLines of
Right tss -> pure $ concat tss
Left e -> throwError $ parseErrorPretty e
-- Extract date-tag style posting dates from the tags.
-- Use the transaction date for defaults, if provided.
let isDateLabel txt = txt == "date" || txt == "date2"
isDateTag = isDateLabel . fst . snd
tagDates <- case traverse tagDate $ filter isDateTag tagsWithPositions of
Right ds -> pure ds
Left e -> throwError $ parseErrorPretty e
-- Reparse the comment for any bracketed style posting dates.
-- Use the transaction date for defaults, if provided.
bracketedDates <- case
traverse (runTextParserAt (bracketedpostingdatesp mdefdate))
commentLines of
Right dss -> pure $ concat dss
Left e -> throwError $ parseErrorPretty e
let pdates = tagDates ++ bracketedDates
mdate = fmap snd $ find ((=="date") .fst) pdates
mdate2 = fmap snd $ find ((=="date2").fst) pdates
-- pdbg 0 $ "allDates: "++show pdates
let strippedComment = T.unlines $ map (T.strip . snd) commentLines
tags = map snd tagsWithPositions
-- pdbg 0 $ "comment:"++show strippedComment
pure (strippedComment, tags, mdate, mdate2)
commenttagsanddatesp
:: Maybe Year -> TextParser m ([Tag], [DateTag])
commenttagsanddatesp mYear = do
(txt, dateTags) <- match $ readUpTo ':'
-- next char is either ':' or '\n' (or EOF)
let tagName = last (T.split isSpace txt)
(fmap.second) (dateTags++) (atColon tagName) <|> pure ([], dateTags) -- if not ':', then either '\n' or EOF
where
runTextParserAt parser (pos, txt) =
runTextParser (setPosition pos *> parser) txt
readUpTo :: Char -> TextParser m [DateTag]
readUpTo end = do
void $ takeWhileP Nothing (\c -> c /= end && c /= '\n' && c /= '[')
-- if not '[' then ':' or '\n' or EOF
atBracket (readUpTo end) <|> pure []
tagDate :: (SourcePos, Tag)
-> Either (ParseError Char CustomErr) (TagName, Day)
tagDate (pos, (name, value)) =
fmap (name,) $ runTextParserAt (datep' myear) (pos, value)
where myear = fmap (first3 . toGregorian) mdefdate
atBracket :: TextParser m [DateTag] -> TextParser m [DateTag]
atBracket cont = do
-- Uses the fact that bracketed date-tags cannot contain newlines
dateTags <- option [] $ lookAhead (bracketeddatetagsp mYear)
_ <- char '['
dateTags' <- cont
pure $ dateTags ++ dateTags'
-- A transaction/posting comment must start with a semicolon. This parser
-- discards the leading whitespace of the comment and returns the source
-- position of the comment's first non-whitespace character.
commentp :: TextParser m (SourcePos, Text)
commentp = commentStartingWithp (==';')
-- A line (file-level) comment can start with a semicolon, hash, or star
-- (allowing org nodes). This parser discards the leading whitespace of
-- the comment and returns the source position of the comment's first
-- non-whitespace character.
linecommentp :: TextParser m (SourcePos, Text)
linecommentp =
commentStartingWithp $ \c -> c == ';' || c == '#' || c == '*'
commentStartingWithp :: (Char -> Bool) -> TextParser m (SourcePos, Text)
commentStartingWithp f = do
-- ptrace "commentStartingWith"
satisfy f
atColon :: Text -> TextParser m ([Tag], [DateTag])
atColon name = char ':' *> do
skipMany spacenonewline
startPos <- getPosition
content <- takeWhileP Nothing (\c -> c /= '\n')
optional newline
return (startPos, content)
(tags, dateTags) <- case name of
"" -> pure ([], [])
"date" -> dateValue name
"date2" -> dateValue name
_ -> tagValue name
_ <- optional $ char ','
bimap (tags++) (dateTags++) <$> commenttagsanddatesp mYear
--- ** tags
dateValue :: Text -> TextParser m ([Tag], [DateTag])
dateValue name = do
(txt, (date, dateTags)) <- match' $ do
date <- datep' mYear
dateTags <- readUpTo ','
pure (date, dateTags)
let val = T.strip txt
pure $ ( [(name, val)]
, (name, date) : dateTags )
-- | Extract any tags (name:value ended by comma or newline) embedded in a string.
--
-- >>> commentTags "a b:, c:c d:d, e"
-- [("b",""),("c","c d:d")]
--
-- >>> commentTags "a [1/1/1] [1/1] [1], [=1/1/1] [=1/1] [=1] [1/1=1/1/1] [1=1/1/1] b:c"
-- [("b","c")]
--
-- --[("date","1/1/1"),("date","1/1"),("date2","1/1/1"),("date2","1/1"),("date","1/1"),("date2","1/1/1"),("date","1"),("date2","1/1/1")]
--
-- >>> commentTags "\na b:, \nd:e, f"
-- [("b",""),("d","e")]
--
-- >>> commentTags ":value"
-- []
--
commentTags :: Text -> [Tag]
commentTags s = either (const []) id $ runTextParser tagsp s
tagValue :: Text -> TextParser m ([Tag], [DateTag])
tagValue name = do
(txt, dateTags) <- match' $ readUpTo ','
let val = T.strip txt
pure $ ( [(name, val)]
, dateTags )
-- | Parse all tags found in a string.
tagsp :: SimpleTextParser [Tag]
tagsp = map snd <$> tagswithvaluepositions
{-# INLINABLE commenttagsanddatesp #-}
tagswithvaluepositions :: SimpleTextParser [(SourcePos, Tag)]
tagswithvaluepositions = do
-- pdbg 0 $ "tagsp"
-- If we parse in a single pass, we cannot know whether some text
-- belongs to a tag label until we have reached a colon (in which case
-- it does) or whitespace (in which case it does not). Therefore, we
-- hold on to the text until we reach such a break point, and then
-- decide what to do.
potentialTagName <- tillNextBreak
atSpaceChar <|> atColon potentialTagName <|> atEof
where
isBreak :: Char -> Bool
isBreak c = isSpace c || c == ':'
tillNextBreak :: SimpleTextParser Text
tillNextBreak = takeWhileP Nothing (not . isBreak)
tagValue :: SimpleTextParser Text
tagValue = T.strip <$> takeWhileP Nothing (not . commaOrNewline)
where commaOrNewline c = c == ',' || c == '\n'
atSpaceChar :: SimpleTextParser [(SourcePos, Tag)]
atSpaceChar = skipSome spaceChar *> tagswithvaluepositions
atColon :: Text -> SimpleTextParser [(SourcePos, Tag)]
atColon tagName = do
char ':'
if T.null tagName
then tagswithvaluepositions
else do
pos <- getPosition
tagVal <- tagValue
let tag = (pos, (tagName, tagVal))
tags <- tagswithvaluepositions
pure $ tag : tags
atEof :: SimpleTextParser [(SourcePos, Tag)]
atEof = eof *> pure []
--- ** posting dates
-- | Parse all bracketed posting dates found in a string. The dates are
-- parsed fully to give useful errors. Missing years can be inferred only
-- if a default date is provided.
--
bracketedpostingdatesp :: Maybe Day -> SimpleTextParser [(TagName,Day)]
bracketedpostingdatesp mdefdate = do
-- pdbg 0 $ "bracketedpostingdatesp"
skipMany $ notChar '['
concat <$> sepEndBy (bracketeddatetagsp mdefdate <|> char '[' *> pure [])
(skipMany $ notChar '[')
--- ** bracketed dates
@ -1105,8 +1131,9 @@ bracketedpostingdatesp mdefdate = do
-- >>> either (Left . parseErrorPretty) Right $ rtp (bracketeddatetagsp Nothing) "[0123456789/-.=/-.=]"
-- Left ...1:13:...expecting month or day...
--
bracketeddatetagsp :: Maybe Day -> SimpleTextParser [(TagName, Day)]
bracketeddatetagsp mdefdate = do
bracketeddatetagsp
:: Maybe Year -> TextParser m [(TagName, Day)]
bracketeddatetagsp mYear1 = do
-- pdbg 0 "bracketeddatetagsp"
try $ do
s <- lookAhead
@ -1117,14 +1144,24 @@ bracketeddatetagsp mdefdate = do
-- Looks sufficiently like a bracketed date to commit to parsing a date
between (char '[') (char ']') $ do
let myear1 = fmap readYear mdefdate
md1 <- optional $ datep' myear1
md1 <- optional $ datep' mYear1
let myear2 = fmap readYear md1 <|> myear1
md2 <- optional $ char '=' *> datep' myear2
let mYear2 = fmap readYear md1 <|> mYear1
md2 <- optional $ char '=' *> datep' mYear2
pure $ catMaybes [("date",) <$> md1, ("date2",) <$> md2]
where
readYear = first3 . toGregorian
isBracketedDateChar c = isDigit c || isDateSepChar c || c == '='
{-# INLINABLE bracketeddatetagsp #-}
--- ** helper parsers
-- A version of `match` that is strict in the returned text
match' :: TextParser m a -> TextParser m (Text, a)
match' p = do
(!txt, p) <- match p
pure (txt, p)

View File

@ -248,12 +248,7 @@ accountdirectivep = do
macode' :: Maybe String <- (optional $ lift $ skipSome spacenonewline >> some digitChar)
let macode :: Maybe AccountCode = read <$> macode'
newline
_tags <- many $ do
startpos <- getPosition
l <- indentedlinep
case runTextParser (setPosition startpos >> tagsp) $ T.pack l of
Right ts -> return ts
Left _e -> return [] -- TODO throwError $ parseErrorPretty e
skipMany indentedlinep
modify' (\j -> j{jaccounts = (acct, macode) : jaccounts j})
@ -477,9 +472,9 @@ transactionp = do
status <- lift statusp <?> "cleared status"
code <- lift codep <?> "transaction code"
description <- T.strip <$> descriptionp
comment <- lift followingcommentp
let tags = commentTags comment
postings <- postingsp (Just date)
(comment, tags) <- lift transactioncommentp
let year = first3 $ toGregorian date
postings <- postingsp (Just year)
pos' <- getPosition
let sourcepos = journalSourcePos pos pos'
return $ txnTieKnot $ Transaction 0 sourcepos date edate status code description comment tags postings ""
@ -581,8 +576,8 @@ test_transactionp = do
-- Parse the following whitespace-beginning lines as postings, posting
-- tags, and/or comments (inferring year, if needed, from the given date).
postingsp :: MonadIO m => Maybe Day -> ErroringJournalParser m [Posting]
postingsp mdate = many (postingp mdate) <?> "postings"
postingsp :: MonadIO m => Maybe Year -> ErroringJournalParser m [Posting]
postingsp mTransactionYear = many (postingp mTransactionYear) <?> "postings"
-- linebeginningwithspaces :: Monad m => JournalParser m String
-- linebeginningwithspaces = do
@ -591,8 +586,8 @@ postingsp mdate = many (postingp mdate) <?> "postings"
-- cs <- lift restofline
-- return $ sp ++ (c:cs) ++ "\n"
postingp :: MonadIO m => Maybe Day -> ErroringJournalParser m Posting
postingp mtdate = do
postingp :: MonadIO m => Maybe Year -> ErroringJournalParser m Posting
postingp mTransactionYear = do
-- pdbg 0 "postingp"
(status, account) <- try $ do
lift (skipSome spacenonewline)
@ -605,7 +600,7 @@ postingp mtdate = do
massertion <- partialbalanceassertionp
_ <- fixedlotpricep
lift (skipMany spacenonewline)
(comment,tags,mdate,mdate2) <- followingcommentandtagsp mtdate
(comment,tags,mdate,mdate2) <- lift $ postingcommentp mTransactionYear
return posting
{ pdate=mdate
, pdate2=mdate2