csv: decimal-mark rule to help with number parsing
Journal keeps a new piece of parsing state, a decimal mark character, which can optionally be set to force the number format expected by all amount parsers.
This commit is contained in:
		
							parent
							
								
									4242a8592a
								
							
						
					
					
						commit
						524e23bc37
					
				| @ -175,6 +175,7 @@ instance Semigroup Journal where | ||||
|   j1 <> j2 = Journal { | ||||
|      jparsedefaultyear          = jparsedefaultyear          j2 | ||||
|     ,jparsedefaultcommodity     = jparsedefaultcommodity     j2 | ||||
|     ,jparsedecimalmark          = jparsedecimalmark          j2 | ||||
|     ,jparseparentaccounts       = jparseparentaccounts       j2 | ||||
|     ,jparsealiases              = jparsealiases              j2 | ||||
|     -- ,jparsetransactioncount     = jparsetransactioncount     j1 +  jparsetransactioncount     j2 | ||||
| @ -201,6 +202,7 @@ nulljournal :: Journal | ||||
| nulljournal = Journal { | ||||
|    jparsedefaultyear          = Nothing | ||||
|   ,jparsedefaultcommodity     = Nothing | ||||
|   ,jparsedecimalmark          = Nothing | ||||
|   ,jparseparentaccounts       = [] | ||||
|   ,jparsealiases              = [] | ||||
|   -- ,jparsetransactioncount     = 0 | ||||
|  | ||||
| @ -159,6 +159,12 @@ data AccountAlias = BasicAlias AccountName AccountName | ||||
| 
 | ||||
| data Side = L | R deriving (Eq,Show,Read,Ord,Generic) | ||||
| 
 | ||||
| -- | One of the decimal marks we support: either period or comma. | ||||
| type DecimalMark = Char | ||||
| 
 | ||||
| isDecimalMark :: Char -> Bool | ||||
| isDecimalMark c = c == '.' || c == ',' | ||||
| 
 | ||||
| -- | The basic numeric type used in amounts. | ||||
| type Quantity = Decimal | ||||
| -- The following is for hledger-web, and requires blaze-markup. | ||||
| @ -440,6 +446,7 @@ data Journal = Journal { | ||||
|   -- parsing-related data | ||||
|    jparsedefaultyear      :: Maybe Year                            -- ^ the current default year, specified by the most recent Y directive (or current date) | ||||
|   ,jparsedefaultcommodity :: Maybe (CommoditySymbol,AmountStyle)   -- ^ the current default commodity and its format, specified by the most recent D directive | ||||
|   ,jparsedecimalmark      :: Maybe DecimalMark                     -- ^ the character to always parse as decimal point, if set by CsvReader's decimal-mark (or a future journal directive) | ||||
|   ,jparseparentaccounts   :: [AccountName]                         -- ^ the current stack of parent account names, specified by apply account directives | ||||
|   ,jparsealiases          :: [AccountAlias]                        -- ^ the current account name aliases in effect, specified by alias directives (& options ?) | ||||
|   -- ,jparsetransactioncount :: Integer                               -- ^ the current count of transactions parsed so far (only journal format txns, currently) | ||||
|  | ||||
| @ -349,6 +349,15 @@ setYear y = modify' (\j -> j{jparsedefaultyear=Just y}) | ||||
| getYear :: JournalParser m (Maybe Year) | ||||
| getYear = fmap jparsedefaultyear get | ||||
| 
 | ||||
| -- | Get the decimal mark that has been specified for parsing, if any | ||||
| -- (eg by the CSV decimal-mark rule, or possibly a future journal directive). | ||||
| -- Return it as an AmountStyle that amount parsers can use. | ||||
| getDecimalMarkStyle :: JournalParser m (Maybe AmountStyle) | ||||
| getDecimalMarkStyle = do | ||||
|   Journal{jparsedecimalmark} <- get | ||||
|   let mdecmarkStyle = maybe Nothing (\c -> Just $ amountstyle{asdecimalpoint=Just c}) jparsedecimalmark | ||||
|   return mdecmarkStyle | ||||
| 
 | ||||
| setDefaultCommodityAndStyle :: (CommoditySymbol,AmountStyle) -> JournalParser m () | ||||
| setDefaultCommodityAndStyle cs = modify' (\j -> j{jparsedefaultcommodity=Just cs}) | ||||
| 
 | ||||
| @ -640,9 +649,26 @@ spaceandamountormissingp = | ||||
| -- or right, followed by, in any order: an optional transaction price, | ||||
| -- an optional ledger-style lot price, and/or an optional ledger-style | ||||
| -- lot date. A lot price and lot date will be ignored. | ||||
| -- | ||||
| -- To parse the amount's quantity (number) we need to know which character  | ||||
| -- represents a decimal mark. We find it in one of three ways: | ||||
| -- | ||||
| -- 1. If a decimal mark has been set explicitly in the journal parse state,  | ||||
| --    we use that | ||||
| -- | ||||
| -- 2. Or if the journal has a commodity declaration for the amount's commodity, | ||||
| --    we get the decimal mark from  that | ||||
| -- | ||||
| -- 3. Otherwise we will parse any valid decimal mark appearing in the | ||||
| --    number, as long as the number appears well formed. | ||||
| -- | ||||
| -- Note 3 is the default zero-config case; it means we automatically handle | ||||
| -- files with any supported decimal mark, but it also allows different decimal marks | ||||
| -- in  different amounts, which is a bit too loose. There's an open issue. | ||||
| amountp :: JournalParser m Amount | ||||
| amountp = label "amount" $ do | ||||
|   let spaces = lift $ skipNonNewlineSpaces | ||||
|   let  | ||||
|     spaces = lift $ skipNonNewlineSpaces | ||||
|   amount <- amountwithoutpricep <* spaces | ||||
|   (mprice, _elotprice, _elotdate) <- runPermutation $ | ||||
|     (,,) <$> toPermutationWithDefault Nothing (Just <$> priceamountp <* spaces) | ||||
| @ -650,9 +676,8 @@ amountp = label "amount" $ do | ||||
|          <*> toPermutationWithDefault Nothing (Just <$> lotdatep <* spaces) | ||||
|   pure $ amount { aprice = mprice } | ||||
| 
 | ||||
| -- XXX Just like amountp but don't allow lot prices. Needed for balanceassertionp. | ||||
| amountpnolotprices :: JournalParser m Amount | ||||
| amountpnolotprices = label "amount" $ do | ||||
| amountpnolotpricesp :: JournalParser m Amount | ||||
| amountpnolotpricesp = label "amount" $ do | ||||
|   let spaces = lift $ skipNonNewlineSpaces | ||||
|   amount <- amountwithoutpricep | ||||
|   spaces | ||||
| @ -669,7 +694,9 @@ amountwithoutpricep = do | ||||
|   leftsymbolamountp :: Bool -> (Decimal -> Decimal) -> JournalParser m Amount | ||||
|   leftsymbolamountp mult sign = label "amount" $ do | ||||
|     c <- lift commoditysymbolp | ||||
|     suggestedStyle <- getAmountStyle c | ||||
|     mdecmarkStyle <- getDecimalMarkStyle | ||||
|     mcommodityStyle <- getAmountStyle c | ||||
|     let suggestedStyle = mdecmarkStyle <|> mcommodityStyle | ||||
|     commodityspaced <- lift skipNonNewlineSpaces' | ||||
|     sign2 <- lift $ signp | ||||
|     offBeforeNum <- getOffset | ||||
| @ -692,14 +719,18 @@ amountwithoutpricep = do | ||||
|     case mSpaceAndCommodity of | ||||
|       -- right symbol amount | ||||
|       Just (commodityspaced, c) -> do | ||||
|         suggestedStyle <- getAmountStyle c | ||||
|         (q,prec,mdec,mgrps) <- lift $ interpretNumber numRegion suggestedStyle ambiguousRawNum mExponent | ||||
|         mdecmarkStyle <- getDecimalMarkStyle | ||||
|         mcommodityStyle <- getAmountStyle c | ||||
|         let msuggestedStyle = mdecmarkStyle <|> mcommodityStyle | ||||
|         (q,prec,mdec,mgrps) <- lift $ interpretNumber numRegion msuggestedStyle ambiguousRawNum mExponent | ||||
|         let s = amountstyle{ascommodityside=R, ascommodityspaced=commodityspaced, asprecision=prec, asdecimalpoint=mdec, asdigitgroups=mgrps} | ||||
|         return $ nullamt{acommodity=c, aquantity=sign q, aismultiplier=mult, astyle=s, aprice=Nothing} | ||||
|       -- no symbol amount | ||||
|       Nothing -> do | ||||
|         suggestedStyle <- getDefaultAmountStyle | ||||
|         (q,prec,mdec,mgrps) <- lift $ interpretNumber numRegion suggestedStyle ambiguousRawNum mExponent | ||||
|         mdecmarkStyle <- getDecimalMarkStyle | ||||
|         mcommodityStyle <- getDefaultAmountStyle | ||||
|         let msuggestedStyle = mdecmarkStyle <|> mcommodityStyle | ||||
|         (q,prec,mdec,mgrps) <- lift $ interpretNumber numRegion msuggestedStyle ambiguousRawNum mExponent | ||||
|         -- if a default commodity has been set, apply it and its style to this amount | ||||
|         -- (unless it's a multiplier in an automated posting) | ||||
|         defcs <- getDefaultCommodityAndStyle | ||||
| @ -716,8 +747,8 @@ amountwithoutpricep = do | ||||
|     -> Either AmbiguousNumber RawNumber | ||||
|     -> Maybe Integer | ||||
|     -> TextParser m (Quantity, AmountPrecision, Maybe Char, Maybe DigitGroupStyle) | ||||
|   interpretNumber posRegion suggestedStyle ambiguousNum mExp = | ||||
|     let rawNum = either (disambiguateNumber suggestedStyle) id ambiguousNum | ||||
|   interpretNumber posRegion msuggestedStyle ambiguousNum mExp = | ||||
|     let rawNum = either (disambiguateNumber msuggestedStyle) id ambiguousNum | ||||
|     in  case fromRawNumber rawNum mExp of | ||||
|           Left errMsg -> customFailure $ | ||||
|                            uncurry parseErrorAtRegion posRegion errMsg | ||||
| @ -776,7 +807,7 @@ balanceassertionp = do | ||||
|   lift skipNonNewlineSpaces | ||||
|   -- this amount can have a price; balance assertions ignore it, | ||||
|   -- but balance assignments will use it | ||||
|   a <- amountpnolotprices <?> "amount (for a balance assertion or assignment)" | ||||
|   a <- amountpnolotpricesp <?> "amount (for a balance assertion or assignment)" | ||||
|   return BalanceAssertion | ||||
|     { baamount    = a | ||||
|     , batotal     = istotal | ||||
| @ -884,13 +915,12 @@ fromRawNumber raw mExp = do | ||||
|       (a:b:cs) | a < b -> b:cs | ||||
|       gs               -> gs | ||||
| 
 | ||||
| 
 | ||||
| disambiguateNumber :: Maybe AmountStyle -> AmbiguousNumber -> RawNumber | ||||
| disambiguateNumber suggestedStyle (AmbiguousNumber grp1 sep grp2) = | ||||
| disambiguateNumber msuggestedStyle (AmbiguousNumber grp1 sep grp2) = | ||||
|   -- If present, use the suggested style to disambiguate; | ||||
|   -- otherwise, assume that the separator is a decimal point where possible. | ||||
|   if isDecimalPointChar sep && | ||||
|      maybe True (sep `isValidDecimalBy`) suggestedStyle | ||||
|   if isDecimalMark sep && | ||||
|      maybe True (sep `isValidDecimalBy`) msuggestedStyle | ||||
|   then NoSeparators grp1 (Just (sep, grp2)) | ||||
|   else WithSeparators sep [grp1, grp2] Nothing | ||||
|   where | ||||
| @ -925,7 +955,7 @@ rawnumberp = label "number" $ do | ||||
|   rawNumber <- fmap Right leadingDecimalPt <|> leadingDigits | ||||
| 
 | ||||
|   -- Guard against mistyped numbers | ||||
|   mExtraDecimalSep <- optional $ lookAhead $ satisfy isDecimalPointChar | ||||
|   mExtraDecimalSep <- optional $ lookAhead $ satisfy isDecimalMark | ||||
|   when (isJust mExtraDecimalSep) $ | ||||
|     Fail.fail "invalid number (invalid use of separator)" | ||||
| 
 | ||||
| @ -941,7 +971,7 @@ rawnumberp = label "number" $ do | ||||
| 
 | ||||
|   leadingDecimalPt :: TextParser m RawNumber | ||||
|   leadingDecimalPt = do | ||||
|     decPt <- satisfy isDecimalPointChar | ||||
|     decPt <- satisfy isDecimalMark | ||||
|     decGrp <- digitgroupp | ||||
|     pure $ NoSeparators mempty (Just (decPt, decGrp)) | ||||
| 
 | ||||
| @ -962,7 +992,7 @@ rawnumberp = label "number" $ do | ||||
| 
 | ||||
|   withDecimalPt :: Char -> [DigitGrp] -> TextParser m RawNumber | ||||
|   withDecimalPt digitSep digitGroups = do | ||||
|     decPt <- satisfy $ \c -> isDecimalPointChar c && c /= digitSep | ||||
|     decPt <- satisfy $ \c -> isDecimalMark c && c /= digitSep | ||||
|     decDigitGrp <- option mempty digitgroupp | ||||
| 
 | ||||
|     pure $ WithSeparators digitSep digitGroups (Just (decPt, decDigitGrp)) | ||||
| @ -974,21 +1004,17 @@ rawnumberp = label "number" $ do | ||||
|     -> [DigitGrp] | ||||
|     -> Either AmbiguousNumber RawNumber | ||||
|   withoutDecimalPt grp1 sep grp2 grps | ||||
|     | null grps && isDecimalPointChar sep = | ||||
|     | null grps && isDecimalMark sep = | ||||
|         Left $ AmbiguousNumber grp1 sep grp2 | ||||
|     | otherwise = Right $ WithSeparators sep (grp1:grp2:grps) Nothing | ||||
| 
 | ||||
|   trailingDecimalPt :: DigitGrp -> TextParser m RawNumber | ||||
|   trailingDecimalPt grp1 = do | ||||
|     decPt <- satisfy isDecimalPointChar | ||||
|     decPt <- satisfy isDecimalMark | ||||
|     pure $ NoSeparators grp1 (Just (decPt, mempty)) | ||||
| 
 | ||||
| 
 | ||||
| isDecimalPointChar :: Char -> Bool | ||||
| isDecimalPointChar c = c == '.' || c == ',' | ||||
| 
 | ||||
| isDigitSeparatorChar :: Char -> Bool | ||||
| isDigitSeparatorChar c = isDecimalPointChar c || c == ' ' | ||||
| isDigitSeparatorChar c = isDecimalMark c || c == ' ' | ||||
| 
 | ||||
| -- | Some kinds of number literal we might parse. | ||||
| data RawNumber | ||||
|  | ||||
| @ -78,7 +78,7 @@ import Text.Printf (printf) | ||||
| 
 | ||||
| import Hledger.Data | ||||
| import Hledger.Utils | ||||
| import Hledger.Read.Common (Reader(..),InputOpts(..),amountp, statusp, genericSourcePos, journalFinalise) | ||||
| import Hledger.Read.Common ( Reader(..),InputOpts(..), amountp, statusp, genericSourcePos, journalFinalise ) | ||||
| 
 | ||||
| --- ** doctest setup | ||||
| -- $setup | ||||
| @ -364,7 +364,7 @@ Grammar for the CSV conversion rules, more or less: | ||||
| 
 | ||||
| RULES: RULE* | ||||
| 
 | ||||
| RULE: ( FIELD-LIST | FIELD-ASSIGNMENT | CONDITIONAL-BLOCK | SKIP | NEWEST-FIRST | DATE-FORMAT | COMMENT | BLANK ) NEWLINE | ||||
| RULE: ( FIELD-LIST | FIELD-ASSIGNMENT | CONDITIONAL-BLOCK | SKIP | NEWEST-FIRST | DATE-FORMAT | DECIMAL-MARK | COMMENT | BLANK ) NEWLINE | ||||
| 
 | ||||
| FIELD-LIST: fields SPACE FIELD-NAME ( SPACE? , SPACE? FIELD-NAME )* | ||||
| 
 | ||||
| @ -462,6 +462,7 @@ directivep = (do | ||||
| directives :: [String] | ||||
| directives = | ||||
|   ["date-format" | ||||
|   ,"decimal-mark" | ||||
|   ,"separator" | ||||
|   -- ,"default-account" | ||||
|   -- ,"default-currency" | ||||
| @ -1048,9 +1049,10 @@ getBalance rules record currency n = do | ||||
| parseAmount :: CsvRules -> CsvRecord -> String -> String -> MixedAmount | ||||
| parseAmount rules record currency s = | ||||
|   either mkerror (Mixed . (:[])) $  -- PARTIAL: | ||||
|   runParser (evalStateT (amountp <* eof) nulljournal) "" $ | ||||
|   runParser (evalStateT (amountp <* eof) journalparsestate) "" $ | ||||
|   T.pack $ (currency++) $ simplifySign s | ||||
|   where | ||||
|     journalparsestate = nulljournal{jparsedecimalmark=parseDecimalMark rules} | ||||
|     mkerror e = error' $ unlines | ||||
|       ["error: could not parse \""++s++"\" as an amount" | ||||
|       ,showRecord record | ||||
| @ -1062,7 +1064,8 @@ parseAmount rules record currency s = | ||||
|         ++"or add or change your skip rule" | ||||
|       ] | ||||
| 
 | ||||
| -- XXX unify these | ||||
| -- XXX unify these ^v | ||||
| 
 | ||||
| -- | Almost but not quite the same as parseAmount. | ||||
| -- Given a non-empty amount string (from CSV) to parse, along with a | ||||
| -- possibly non-empty currency symbol to prepend, | ||||
| @ -1071,10 +1074,11 @@ parseAmount rules record currency s = | ||||
| parseBalanceAmount :: CsvRules -> CsvRecord -> String -> Int -> String -> Amount | ||||
| parseBalanceAmount rules record currency n s = | ||||
|   either (mkerror n s) id $ | ||||
|     runParser (evalStateT (amountp <* eof) nulljournal) "" $ | ||||
|     runParser (evalStateT (amountp <* eof) journalparsestate) "" $ | ||||
|     T.pack $ (currency++) $ simplifySign s | ||||
|                   -- the csv record's line number would be good | ||||
|   where | ||||
|     journalparsestate = nulljournal{jparsedecimalmark=parseDecimalMark rules} | ||||
|     mkerror n s e = error' $ unlines | ||||
|       ["error: could not parse \""++s++"\" as balance"++show n++" amount" | ||||
|       ,showRecord record | ||||
| @ -1083,6 +1087,15 @@ parseBalanceAmount rules record currency n s = | ||||
|       ,"the parse error is:      "++customErrorBundlePretty e | ||||
|       ] | ||||
| 
 | ||||
| -- Read a valid decimal mark from the decimal-mark rule, if any. | ||||
| -- If the rule is present with an invalid argument, raise an error. | ||||
| parseDecimalMark :: CsvRules -> Maybe DecimalMark | ||||
| parseDecimalMark rules = | ||||
|   case rules `csvRule` "decimal-mark" of | ||||
|     Nothing -> Nothing | ||||
|     Just [c] | isDecimalMark c -> Just c | ||||
|     Just s -> error' $ "decimal-mark's argument should be \".\" or \",\" (not \""++s++"\")" | ||||
| 
 | ||||
| -- | Make a balance assertion for the given amount, with the given parse | ||||
| -- position (to be shown in assertion failures), with the assertion type | ||||
| -- possibly set by a balance-type rule. | ||||
|  | ||||
| @ -45,7 +45,8 @@ these are described more fully below, after the examples: | ||||
| | [**`if` block**](#if-block)               | apply some rules to CSV records matched by patterns     | | ||||
| | [**`if` table**](#if-table)               | apply some rules to CSV records matched by patterns, alternate syntax | | ||||
| | [**`end`**](#end)                         | skip the remaining CSV records                          | | ||||
| | [**`date-format`**](#date-format)         | describe the format of CSV dates                        | | ||||
| | [**`date-format`**](#date-format)         | how to parse dates in CSV records                       | | ||||
| | [**`decimal-mark`**](#decimal-mark)       | the decimal mark used in CSV amounts, if ambiguous      | | ||||
| | [**`newest-first`**](#newest-first)       | disambiguate record order when there's only one date    | | ||||
| | [**`include`**](#include)                 | inline another CSV rules file                           | | ||||
| | [**`balance-type`**](#balance-type)       | choose which type of balance assignments to use         | | ||||
| @ -716,6 +717,21 @@ For the supported strptime syntax, see:\ | ||||
| <https://hackage.haskell.org/package/time/docs/Data-Time-Format.html#v:formatTime> | ||||
| 
 | ||||
| 
 | ||||
| ## `decimal-mark` | ||||
| 
 | ||||
| ```rules | ||||
| decimal-mark . | ||||
| ``` | ||||
| or: | ||||
| ```rules | ||||
| decimal-mark , | ||||
| ``` | ||||
| 
 | ||||
| hledger automatically accepts either period or comma as a decimal mark when parsing numbers | ||||
| (cf [Amounts](journal.html#amounts)). | ||||
| However if any numbers in the CSV contain digit group marks, such as thousand-separating commas, | ||||
| you should declare the decimal mark explicitly with this rule, to avoid misparsed numbers. | ||||
| 
 | ||||
| ## `newest-first` | ||||
| 
 | ||||
| hledger always sorts the generated transactions by date. | ||||
|  | ||||
| @ -906,6 +906,46 @@ $  ./csvtest.sh | ||||
| 
 | ||||
| >=0 | ||||
| 
 | ||||
| # 45. decimal-mark helps parse ambiguous decimals correctly | ||||
| < | ||||
| 2020-01-01,"1,000" | ||||
| 2020-01-02,"1.000" | ||||
| 
 | ||||
| RULES | ||||
| fields date,amount | ||||
| decimal-mark . | ||||
| 
 | ||||
| $  ./csvtest.sh | ||||
| 2020-01-01 | ||||
|     expenses:unknown       1,000.000 | ||||
|     income:unknown        -1,000.000 | ||||
| 
 | ||||
| 2020-01-02 | ||||
|     expenses:unknown           1.000 | ||||
|     income:unknown            -1.000 | ||||
| 
 | ||||
| >= | ||||
| 
 | ||||
| # 46.  | ||||
| < | ||||
| 2020-01-01,"1,000" | ||||
| 2020-01-02,"1.000" | ||||
| 
 | ||||
| RULES | ||||
| fields date,amount | ||||
| decimal-mark , | ||||
| 
 | ||||
| $  ./csvtest.sh | ||||
| 2020-01-01 | ||||
|     expenses:unknown           1,000 | ||||
|     income:unknown            -1,000 | ||||
| 
 | ||||
| 2020-01-02 | ||||
|     expenses:unknown       1.000,000 | ||||
|     income:unknown        -1.000,000 | ||||
| 
 | ||||
| >= | ||||
| 
 | ||||
| 
 | ||||
| ## .  | ||||
| #< | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user