csv: decimal-mark rule to help with number parsing

Journal keeps a new piece of parsing state, a decimal mark character, which can optionally be set to force the number format expected by all amount parsers.
2020-11-06 18:45:52 -10:00 · 2020-11-06 18:45:52 -10:00 · 524e23bc37
commit 524e23bc37
parent 4242a8592a
6 changed files with 136 additions and 32 deletions
--- a/hledger-lib/Hledger/Data/Journal.hs
+++ b/hledger-lib/Hledger/Data/Journal.hs
@ -175,6 +175,7 @@ instance Semigroup Journal where
  j1 <> j2 = Journal {
     jparsedefaultyear          = jparsedefaultyear          j2
    ,jparsedefaultcommodity     = jparsedefaultcommodity     j2
+    ,jparsedecimalmark          = jparsedecimalmark          j2
    ,jparseparentaccounts       = jparseparentaccounts       j2
    ,jparsealiases              = jparsealiases              j2
    -- ,jparsetransactioncount     = jparsetransactioncount     j1 +  jparsetransactioncount     j2
@ -201,6 +202,7 @@ nulljournal :: Journal
 nulljournal = Journal {
   jparsedefaultyear          = Nothing
  ,jparsedefaultcommodity     = Nothing
+  ,jparsedecimalmark          = Nothing
  ,jparseparentaccounts       = []
  ,jparsealiases              = []
  -- ,jparsetransactioncount     = 0
--- a/hledger-lib/Hledger/Data/Types.hs
+++ b/hledger-lib/Hledger/Data/Types.hs
@ -159,6 +159,12 @@ data AccountAlias = BasicAlias AccountName AccountName

 data Side = L | R deriving (Eq,Show,Read,Ord,Generic)

+-- | One of the decimal marks we support: either period or comma.
+type DecimalMark = Char
+
+isDecimalMark :: Char -> Bool
+isDecimalMark c = c == '.' || c == ','
+
 -- | The basic numeric type used in amounts.
 type Quantity = Decimal
 -- The following is for hledger-web, and requires blaze-markup.
@ -440,6 +446,7 @@ data Journal = Journal {
  -- parsing-related data
   jparsedefaultyear      :: Maybe Year                            -- ^ the current default year, specified by the most recent Y directive (or current date)
  ,jparsedefaultcommodity :: Maybe (CommoditySymbol,AmountStyle)   -- ^ the current default commodity and its format, specified by the most recent D directive
+  ,jparsedecimalmark      :: Maybe DecimalMark                     -- ^ the character to always parse as decimal point, if set by CsvReader's decimal-mark (or a future journal directive)
  ,jparseparentaccounts   :: [AccountName]                         -- ^ the current stack of parent account names, specified by apply account directives
  ,jparsealiases          :: [AccountAlias]                        -- ^ the current account name aliases in effect, specified by alias directives (& options ?)
  -- ,jparsetransactioncount :: Integer                               -- ^ the current count of transactions parsed so far (only journal format txns, currently)
--- a/hledger-lib/Hledger/Read/Common.hs
+++ b/hledger-lib/Hledger/Read/Common.hs
@ -349,6 +349,15 @@ setYear y = modify' (\j -> j{jparsedefaultyear=Just y})
 getYear :: JournalParser m (Maybe Year)
 getYear = fmap jparsedefaultyear get

+-- | Get the decimal mark that has been specified for parsing, if any
+-- (eg by the CSV decimal-mark rule, or possibly a future journal directive).
+-- Return it as an AmountStyle that amount parsers can use.
+getDecimalMarkStyle :: JournalParser m (Maybe AmountStyle)
+getDecimalMarkStyle = do
+  Journal{jparsedecimalmark} <- get
+  let mdecmarkStyle = maybe Nothing (\c -> Just $ amountstyle{asdecimalpoint=Just c}) jparsedecimalmark
+  return mdecmarkStyle
+
 setDefaultCommodityAndStyle :: (CommoditySymbol,AmountStyle) -> JournalParser m ()
 setDefaultCommodityAndStyle cs = modify' (\j -> j{jparsedefaultcommodity=Just cs})

@ -640,9 +649,26 @@ spaceandamountormissingp =
 -- or right, followed by, in any order: an optional transaction price,
 -- an optional ledger-style lot price, and/or an optional ledger-style
 -- lot date. A lot price and lot date will be ignored.
+--
+-- To parse the amount's quantity (number) we need to know which character 
+-- represents a decimal mark. We find it in one of three ways:
+--
+-- 1. If a decimal mark has been set explicitly in the journal parse state, 
+--    we use that
+--
+-- 2. Or if the journal has a commodity declaration for the amount's commodity,
+--    we get the decimal mark from  that
+--
+-- 3. Otherwise we will parse any valid decimal mark appearing in the
+--    number, as long as the number appears well formed.
+--
+-- Note 3 is the default zero-config case; it means we automatically handle
+-- files with any supported decimal mark, but it also allows different decimal marks
+-- in  different amounts, which is a bit too loose. There's an open issue.
 amountp :: JournalParser m Amount
 amountp = label "amount" $ do
-  let spaces = lift $ skipNonNewlineSpaces
+  let 
+    spaces = lift $ skipNonNewlineSpaces
  amount <- amountwithoutpricep <* spaces
  (mprice, _elotprice, _elotdate) <- runPermutation $
    (,,) <$> toPermutationWithDefault Nothing (Just <$> priceamountp <* spaces)
@ -650,9 +676,8 @@ amountp = label "amount" $ do
         <*> toPermutationWithDefault Nothing (Just <$> lotdatep <* spaces)
  pure $ amount { aprice = mprice }

-- XXX Just like amountp but don't allow lot prices. Needed for balanceassertionp.
-amountpnolotprices :: JournalParser m Amount
-amountpnolotprices = label "amount" $ do
+amountpnolotpricesp :: JournalParser m Amount
+amountpnolotpricesp = label "amount" $ do
  let spaces = lift $ skipNonNewlineSpaces
  amount <- amountwithoutpricep
  spaces
@ -669,7 +694,9 @@ amountwithoutpricep = do
  leftsymbolamountp :: Bool -> (Decimal -> Decimal) -> JournalParser m Amount
  leftsymbolamountp mult sign = label "amount" $ do
    c <- lift commoditysymbolp
-    suggestedStyle <- getAmountStyle c
+    mdecmarkStyle <- getDecimalMarkStyle
+    mcommodityStyle <- getAmountStyle c
+    let suggestedStyle = mdecmarkStyle <|> mcommodityStyle
    commodityspaced <- lift skipNonNewlineSpaces'
    sign2 <- lift $ signp
    offBeforeNum <- getOffset
@ -692,14 +719,18 @@ amountwithoutpricep = do
    case mSpaceAndCommodity of
      -- right symbol amount
      Just (commodityspaced, c) -> do
-        suggestedStyle <- getAmountStyle c
-        (q,prec,mdec,mgrps) <- lift $ interpretNumber numRegion suggestedStyle ambiguousRawNum mExponent
+        mdecmarkStyle <- getDecimalMarkStyle
+        mcommodityStyle <- getAmountStyle c
+        let msuggestedStyle = mdecmarkStyle <|> mcommodityStyle
+        (q,prec,mdec,mgrps) <- lift $ interpretNumber numRegion msuggestedStyle ambiguousRawNum mExponent
        let s = amountstyle{ascommodityside=R, ascommodityspaced=commodityspaced, asprecision=prec, asdecimalpoint=mdec, asdigitgroups=mgrps}
        return $ nullamt{acommodity=c, aquantity=sign q, aismultiplier=mult, astyle=s, aprice=Nothing}
      -- no symbol amount
      Nothing -> do
-        suggestedStyle <- getDefaultAmountStyle
-        (q,prec,mdec,mgrps) <- lift $ interpretNumber numRegion suggestedStyle ambiguousRawNum mExponent
+        mdecmarkStyle <- getDecimalMarkStyle
+        mcommodityStyle <- getDefaultAmountStyle
+        let msuggestedStyle = mdecmarkStyle <|> mcommodityStyle
+        (q,prec,mdec,mgrps) <- lift $ interpretNumber numRegion msuggestedStyle ambiguousRawNum mExponent
        -- if a default commodity has been set, apply it and its style to this amount
        -- (unless it's a multiplier in an automated posting)
        defcs <- getDefaultCommodityAndStyle
@ -716,8 +747,8 @@ amountwithoutpricep = do
    -> Either AmbiguousNumber RawNumber
    -> Maybe Integer
    -> TextParser m (Quantity, AmountPrecision, Maybe Char, Maybe DigitGroupStyle)
-  interpretNumber posRegion suggestedStyle ambiguousNum mExp =
-    let rawNum = either (disambiguateNumber suggestedStyle) id ambiguousNum
+  interpretNumber posRegion msuggestedStyle ambiguousNum mExp =
+    let rawNum = either (disambiguateNumber msuggestedStyle) id ambiguousNum
    in  case fromRawNumber rawNum mExp of
          Left errMsg -> customFailure $
                           uncurry parseErrorAtRegion posRegion errMsg
@ -776,7 +807,7 @@ balanceassertionp = do
  lift skipNonNewlineSpaces
  -- this amount can have a price; balance assertions ignore it,
  -- but balance assignments will use it
-  a <- amountpnolotprices <?> "amount (for a balance assertion or assignment)"
+  a <- amountpnolotpricesp <?> "amount (for a balance assertion or assignment)"
  return BalanceAssertion
    { baamount    = a
    , batotal     = istotal
@ -884,13 +915,12 @@ fromRawNumber raw mExp = do
      (a:b:cs) | a < b -> b:cs
      gs               -> gs

-
 disambiguateNumber :: Maybe AmountStyle -> AmbiguousNumber -> RawNumber
-disambiguateNumber suggestedStyle (AmbiguousNumber grp1 sep grp2) =
+disambiguateNumber msuggestedStyle (AmbiguousNumber grp1 sep grp2) =
  -- If present, use the suggested style to disambiguate;
  -- otherwise, assume that the separator is a decimal point where possible.
-  if isDecimalPointChar sep &&
-     maybe True (sep `isValidDecimalBy`) suggestedStyle
+  if isDecimalMark sep &&
+     maybe True (sep `isValidDecimalBy`) msuggestedStyle
  then NoSeparators grp1 (Just (sep, grp2))
  else WithSeparators sep [grp1, grp2] Nothing
  where
@ -925,7 +955,7 @@ rawnumberp = label "number" $ do
  rawNumber <- fmap Right leadingDecimalPt <|> leadingDigits

  -- Guard against mistyped numbers
-  mExtraDecimalSep <- optional $ lookAhead $ satisfy isDecimalPointChar
+  mExtraDecimalSep <- optional $ lookAhead $ satisfy isDecimalMark
  when (isJust mExtraDecimalSep) $
    Fail.fail "invalid number (invalid use of separator)"

@ -941,7 +971,7 @@ rawnumberp = label "number" $ do

  leadingDecimalPt :: TextParser m RawNumber
  leadingDecimalPt = do
-    decPt <- satisfy isDecimalPointChar
+    decPt <- satisfy isDecimalMark
    decGrp <- digitgroupp
    pure $ NoSeparators mempty (Just (decPt, decGrp))

@ -962,7 +992,7 @@ rawnumberp = label "number" $ do

  withDecimalPt :: Char -> [DigitGrp] -> TextParser m RawNumber
  withDecimalPt digitSep digitGroups = do
-    decPt <- satisfy $ \c -> isDecimalPointChar c && c /= digitSep
+    decPt <- satisfy $ \c -> isDecimalMark c && c /= digitSep
    decDigitGrp <- option mempty digitgroupp

    pure $ WithSeparators digitSep digitGroups (Just (decPt, decDigitGrp))
@ -974,21 +1004,17 @@ rawnumberp = label "number" $ do
    -> [DigitGrp]
    -> Either AmbiguousNumber RawNumber
  withoutDecimalPt grp1 sep grp2 grps
-    | null grps && isDecimalPointChar sep =
+    | null grps && isDecimalMark sep =
        Left $ AmbiguousNumber grp1 sep grp2
    | otherwise = Right $ WithSeparators sep (grp1:grp2:grps) Nothing

  trailingDecimalPt :: DigitGrp -> TextParser m RawNumber
  trailingDecimalPt grp1 = do
-    decPt <- satisfy isDecimalPointChar
+    decPt <- satisfy isDecimalMark
    pure $ NoSeparators grp1 (Just (decPt, mempty))

-
-isDecimalPointChar :: Char -> Bool
-isDecimalPointChar c = c == '.' || c == ','
-
 isDigitSeparatorChar :: Char -> Bool
-isDigitSeparatorChar c = isDecimalPointChar c || c == ' '
+isDigitSeparatorChar c = isDecimalMark c || c == ' '

 -- | Some kinds of number literal we might parse.
 data RawNumber
--- a/hledger-lib/Hledger/Read/CsvReader.hs
+++ b/hledger-lib/Hledger/Read/CsvReader.hs
@ -78,7 +78,7 @@ import Text.Printf (printf)

 import Hledger.Data
 import Hledger.Utils
-import Hledger.Read.Common (Reader(..),InputOpts(..),amountp, statusp, genericSourcePos, journalFinalise)
+import Hledger.Read.Common ( Reader(..),InputOpts(..), amountp, statusp, genericSourcePos, journalFinalise )

 --- ** doctest setup
 -- $setup
@ -364,7 +364,7 @@ Grammar for the CSV conversion rules, more or less:

 RULES: RULE*

-RULE: ( FIELD-LIST | FIELD-ASSIGNMENT | CONDITIONAL-BLOCK | SKIP | NEWEST-FIRST | DATE-FORMAT | COMMENT | BLANK ) NEWLINE
+RULE: ( FIELD-LIST | FIELD-ASSIGNMENT | CONDITIONAL-BLOCK | SKIP | NEWEST-FIRST | DATE-FORMAT | DECIMAL-MARK | COMMENT | BLANK ) NEWLINE

 FIELD-LIST: fields SPACE FIELD-NAME ( SPACE? , SPACE? FIELD-NAME )*

@ -462,6 +462,7 @@ directivep = (do
 directives :: [String]
 directives =
  ["date-format"
+  ,"decimal-mark"
  ,"separator"
  -- ,"default-account"
  -- ,"default-currency"
@ -1048,9 +1049,10 @@ getBalance rules record currency n = do
 parseAmount :: CsvRules -> CsvRecord -> String -> String -> MixedAmount
 parseAmount rules record currency s =
  either mkerror (Mixed . (:[])) $  -- PARTIAL:
-  runParser (evalStateT (amountp <* eof) nulljournal) "" $
+  runParser (evalStateT (amountp <* eof) journalparsestate) "" $
  T.pack $ (currency++) $ simplifySign s
  where
+    journalparsestate = nulljournal{jparsedecimalmark=parseDecimalMark rules}
    mkerror e = error' $ unlines
      ["error: could not parse \""++s++"\" as an amount"
      ,showRecord record
@ -1062,7 +1064,8 @@ parseAmount rules record currency s =
        ++"or add or change your skip rule"
      ]

-- XXX unify these
+-- XXX unify these ^v
+
 -- | Almost but not quite the same as parseAmount.
 -- Given a non-empty amount string (from CSV) to parse, along with a
 -- possibly non-empty currency symbol to prepend,
@ -1071,10 +1074,11 @@ parseAmount rules record currency s =
 parseBalanceAmount :: CsvRules -> CsvRecord -> String -> Int -> String -> Amount
 parseBalanceAmount rules record currency n s =
  either (mkerror n s) id $
-    runParser (evalStateT (amountp <* eof) nulljournal) "" $
+    runParser (evalStateT (amountp <* eof) journalparsestate) "" $
    T.pack $ (currency++) $ simplifySign s
                  -- the csv record's line number would be good
  where
+    journalparsestate = nulljournal{jparsedecimalmark=parseDecimalMark rules}
    mkerror n s e = error' $ unlines
      ["error: could not parse \""++s++"\" as balance"++show n++" amount"
      ,showRecord record
@ -1083,6 +1087,15 @@ parseBalanceAmount rules record currency n s =
      ,"the parse error is:      "++customErrorBundlePretty e
      ]

+-- Read a valid decimal mark from the decimal-mark rule, if any.
+-- If the rule is present with an invalid argument, raise an error.
+parseDecimalMark :: CsvRules -> Maybe DecimalMark
+parseDecimalMark rules =
+  case rules `csvRule` "decimal-mark" of
+    Nothing -> Nothing
+    Just [c] | isDecimalMark c -> Just c
+    Just s -> error' $ "decimal-mark's argument should be \".\" or \",\" (not \""++s++"\")"
+
 -- | Make a balance assertion for the given amount, with the given parse
 -- position (to be shown in assertion failures), with the assertion type
 -- possibly set by a balance-type rule.
--- a/hledger-lib/hledger_csv.m4.md
+++ b/hledger-lib/hledger_csv.m4.md
@ -45,7 +45,8 @@ these are described more fully below, after the examples:
 | [**`if` block**](#if-block)               | apply some rules to CSV records matched by patterns     |
 | [**`if` table**](#if-table)               | apply some rules to CSV records matched by patterns, alternate syntax |
 | [**`end`**](#end)                         | skip the remaining CSV records                          |
-| [**`date-format`**](#date-format)         | describe the format of CSV dates                        |
+| [**`date-format`**](#date-format)         | how to parse dates in CSV records                       |
+| [**`decimal-mark`**](#decimal-mark)       | the decimal mark used in CSV amounts, if ambiguous      |
 | [**`newest-first`**](#newest-first)       | disambiguate record order when there's only one date    |
 | [**`include`**](#include)                 | inline another CSV rules file                           |
 | [**`balance-type`**](#balance-type)       | choose which type of balance assignments to use         |
@ -716,6 +717,21 @@ For the supported strptime syntax, see:\
 <https://hackage.haskell.org/package/time/docs/Data-Time-Format.html#v:formatTime>


+## `decimal-mark`
+
+```rules
+decimal-mark .
+```
+or:
+```rules
+decimal-mark ,
+```
+
+hledger automatically accepts either period or comma as a decimal mark when parsing numbers
+(cf [Amounts](journal.html#amounts)).
+However if any numbers in the CSV contain digit group marks, such as thousand-separating commas,
+you should declare the decimal mark explicitly with this rule, to avoid misparsed numbers.
+
 ## `newest-first`

 hledger always sorts the generated transactions by date.
--- a/tests/csv.test
+++ b/tests/csv.test
@ -906,6 +906,46 @@ $  ./csvtest.sh

 >=0

+# 45. decimal-mark helps parse ambiguous decimals correctly
+<
+2020-01-01,"1,000"
+2020-01-02,"1.000"
+
+RULES
+fields date,amount
+decimal-mark .
+
+$  ./csvtest.sh
+2020-01-01
+    expenses:unknown       1,000.000
+    income:unknown        -1,000.000
+
+2020-01-02
+    expenses:unknown           1.000
+    income:unknown            -1.000
+
+>=
+
+# 46. 
+<
+2020-01-01,"1,000"
+2020-01-02,"1.000"
+
+RULES
+fields date,amount
+decimal-mark ,
+
+$  ./csvtest.sh
+2020-01-01
+    expenses:unknown           1,000
+    income:unknown            -1,000
+
+2020-01-02
+    expenses:unknown       1.000,000
+    income:unknown        -1.000,000
+
+>=
+

 ## . 
 #<