lib: refactor the raw number parser [API change]
This commit is contained in:
		
							parent
							
								
									93fbac99d3
								
							
						
					
					
						commit
						121ba92ade
					
				| @ -101,12 +101,17 @@ import Control.Monad.State.Strict | |||||||
| import Data.Bifunctor | import Data.Bifunctor | ||||||
| import Data.Char | import Data.Char | ||||||
| import Data.Data | import Data.Data | ||||||
|  | import Data.Decimal (DecimalRaw (Decimal), Decimal) | ||||||
| import Data.Default | import Data.Default | ||||||
| import Data.Functor.Identity | import Data.Functor.Identity | ||||||
| import Data.List.Compat | import Data.List.Compat | ||||||
| import Data.List.NonEmpty (NonEmpty(..)) | import Data.List.NonEmpty (NonEmpty(..)) | ||||||
| import Data.Maybe | import Data.Maybe | ||||||
|  | #if !(MIN_VERSION_base(4,11,0)) | ||||||
|  | import Data.Monoid | ||||||
|  | #endif | ||||||
| import qualified Data.Map as M | import qualified Data.Map as M | ||||||
|  | import qualified Data.Semigroup as Sem | ||||||
| import Data.Text (Text) | import Data.Text (Text) | ||||||
| import qualified Data.Text as T | import qualified Data.Text as T | ||||||
| import Data.Time.Calendar | import Data.Time.Calendar | ||||||
| @ -709,93 +714,201 @@ exponentp = do | |||||||
|     return $ bimap (* 10^^exp) (max 0 . subtract exp) |     return $ bimap (* 10^^exp) (max 0 . subtract exp) | ||||||
|     <?> "exponentp" |     <?> "exponentp" | ||||||
| 
 | 
 | ||||||
| -- | Interpret the raw parts of a number, using the provided amount style if any, | -- | Interpret a raw number as a decimal number, and identify the decimal | ||||||
| -- determining the decimal point character and digit groups where possible. | -- point charcter and digit separating scheme. There is only one ambiguous | ||||||
|  | -- case: when there is just a single separator between two digit groups. | ||||||
|  | -- Disambiguate using an amount style, if provided; otherwise, assume that | ||||||
|  | -- the separator is a decimal point. | ||||||
|  | -- | ||||||
| -- Returns: | -- Returns: | ||||||
| -- - the decimal number | -- - the decimal number | ||||||
| -- - the precision (number of digits after the decimal point)   | -- - the precision (number of digits after the decimal point)   | ||||||
| -- - the decimal point character, if any | -- - the decimal point character, if any | ||||||
| -- - the digit group style, if any (digit group character and sizes of digit groups) | -- - the digit group style, if any (digit group character and sizes of digit groups) | ||||||
| fromRawNumber :: Maybe AmountStyle -> Bool -> (Maybe Char, [String], Maybe (Char, String)) -> (Quantity, Int, Maybe Char, Maybe DigitGroupStyle) | fromRawNumber | ||||||
| fromRawNumber suggestedStyle negated raw = (quantity, precision, mdecimalpoint, mgrps) where |   :: Maybe AmountStyle | ||||||
|     -- unpack with a hint if useful |   -> Bool | ||||||
|     (mseparator, intparts, mdecimalpoint, frac) = |   -> RawNumber | ||||||
|             case raw of |   -> (Quantity, Int, Maybe Char, Maybe DigitGroupStyle) | ||||||
|                 -- If the number consists of exactly two digit groups | fromRawNumber suggestedStyle negated raw = case raw of | ||||||
|                 -- separated by a valid decimal point character, we assume |  | ||||||
|                 -- that the character represents a decimal point. |  | ||||||
|                 (Just s, [firstGroup, lastGroup], Nothing) |  | ||||||
|                     | s `elem` decimalPointChars && maybe True (`asdecimalcheck` s) suggestedStyle -> |  | ||||||
|                         (Nothing, [firstGroup], Just s, lastGroup) |  | ||||||
| 
 | 
 | ||||||
|                 (firstSep, digitGroups, Nothing) -> (firstSep, digitGroups, Nothing, []) |   LeadingDecimalPt decPt digitGrp -> | ||||||
|                 (firstSep, digitGroups, Just (d, frac)) -> (firstSep, digitGroups, Just d, frac) |     let quantity = sign $ Decimal (fromIntegral precision) | ||||||
|  |                                   (digitGroupNumber digitGrp) | ||||||
|  |         precision = digitGroupLength digitGrp | ||||||
|  |     in  (quantity, precision, Just decPt, Nothing) | ||||||
| 
 | 
 | ||||||
|     -- get the digit group sizes and digit group style if any |   TrailingDecimalPt digitGrp decPt -> | ||||||
|     groupsizes = reverse $ case map length intparts of |     let quantity = sign $ Decimal (fromIntegral precision) | ||||||
|  |                                   (digitGroupNumber digitGrp) | ||||||
|  |         precision = 0 | ||||||
|  |     in  (quantity, precision, Just decPt, Nothing) | ||||||
|  | 
 | ||||||
|  |   NoSeparators digitGrp -> | ||||||
|  |     let quantity = sign $ Decimal (fromIntegral precision) | ||||||
|  |                                   (digitGroupNumber digitGrp) | ||||||
|  |         precision = 0 | ||||||
|  |     in  (quantity, precision, Nothing, Nothing) | ||||||
|  | 
 | ||||||
|  |   AmbiguousNumber digitGrp1 sep digitGrp2 -> | ||||||
|  |     -- If present, use the suggested style to disambiguate; | ||||||
|  |     -- otherwise, assume that the separator is a decimal point where possible. | ||||||
|  |     if isDecimalPointChar sep && maybe True (sep `isValidDecimalBy`) suggestedStyle | ||||||
|  | 
 | ||||||
|  |     then -- Assuming that the separator is a decimal point | ||||||
|  |       let quantity = sign $ Decimal (fromIntegral precision) | ||||||
|  |                                     (digitGroupNumber $ digitGrp1 <> digitGrp2) | ||||||
|  |           precision = digitGroupLength digitGrp2 | ||||||
|  |       in  (quantity, precision, Just sep, Nothing) | ||||||
|  | 
 | ||||||
|  |     else -- Assuming that the separator is digit separator | ||||||
|  |       let quantity = sign $ Decimal (fromIntegral precision) | ||||||
|  |                                     (digitGroupNumber $ digitGrp1 <> digitGrp2) | ||||||
|  |           precision = 0 | ||||||
|  |           digitGroupStyle = Just $ | ||||||
|  |             DigitGroups sep (groupSizes $ [digitGrp1, digitGrp2]) | ||||||
|  |       in  (quantity, precision, Nothing, digitGroupStyle) | ||||||
|  | 
 | ||||||
|  |   DigitSeparators digitSep digitGrps -> | ||||||
|  |     let quantity = sign $ Decimal (fromIntegral precision) | ||||||
|  |                                   (digitGroupNumber $ mconcat digitGrps) | ||||||
|  |         precision = 0 | ||||||
|  |         digitGroupStyle = Just $ DigitGroups digitSep (groupSizes digitGrps) | ||||||
|  |     in  (quantity, precision, Nothing, digitGroupStyle) | ||||||
|  | 
 | ||||||
|  |   BothSeparators digitSep digitGrps decPt decimalGrp -> | ||||||
|  |     let quantity = | ||||||
|  |           sign $ Decimal (fromIntegral precision) | ||||||
|  |                          (digitGroupNumber $ mconcat digitGrps <> decimalGrp) | ||||||
|  |         precision = digitGroupLength decimalGrp | ||||||
|  |         digitGroupStyle = Just $ DigitGroups digitSep (groupSizes digitGrps) | ||||||
|  |     in  (quantity, precision, Just decPt, digitGroupStyle) | ||||||
|  | 
 | ||||||
|  |   where | ||||||
|  | 
 | ||||||
|  |     sign :: Decimal -> Decimal | ||||||
|  |     sign = if negated then negate else id | ||||||
|  | 
 | ||||||
|  |     -- Outputs digit group sizes from least significant to most significant | ||||||
|  |     groupSizes :: [DigitGrp] -> [Int] | ||||||
|  |     groupSizes digitGrps = reverse $ case map digitGroupLength digitGrps of | ||||||
|       (a:b:cs) | a < b -> b:cs |       (a:b:cs) | a < b -> b:cs | ||||||
|       gs               -> gs |       gs               -> gs | ||||||
|     mgrps = (`DigitGroups` groupsizes) <$> mseparator |  | ||||||
| 
 | 
 | ||||||
|     -- put the parts back together without digit group separators, get the precision and parse the value |     isValidDecimalBy :: Char -> AmountStyle -> Bool | ||||||
|     repr = (if negated then "-" else "") ++ "0" ++ concat intparts ++ (if null frac then "" else "." ++ frac) |     isValidDecimalBy c = \case | ||||||
|     quantity = read repr |       AmountStyle{asdecimalpoint = Just d} -> d == c | ||||||
|     precision = length frac |       AmountStyle{asdigitgroups = Just (DigitGroups g _)} -> g /= c | ||||||
|  |       AmountStyle{asprecision = 0} -> False | ||||||
|  |       _ -> True | ||||||
| 
 | 
 | ||||||
|     asdecimalcheck :: AmountStyle -> Char -> Bool |  | ||||||
|     asdecimalcheck = \case |  | ||||||
|         AmountStyle{asdecimalpoint = Just d} -> (d ==) |  | ||||||
|         AmountStyle{asdigitgroups = Just (DigitGroups g _)} -> (g /=) |  | ||||||
|         AmountStyle{asprecision = 0} -> const False |  | ||||||
|         _ -> const True |  | ||||||
| 
 | 
 | ||||||
| -- | Pre-parse a number into parts for further interpretation. | -- | Parse and interpret the structure of a number as far as possible | ||||||
| -- Numbers may optionally have a period/comma decimal point  | -- without external hints. Numbers are digit strings, possibly separated | ||||||
| -- and/or comma/period/space digit group separators, but we don't | -- into digit groups by one of two types of separators. (1) Numbers may | ||||||
| -- decide which is which here, just return the parts: | -- optionally have a decimal point, which may be either a period or comma. | ||||||
|  | -- (2) Numbers may optionally contain digit group separators, which must | ||||||
|  | -- all be either a period, a comma, or a space. | ||||||
| -- | -- | ||||||
| -- - the first separator char (period or comma or space) seen, if any | -- It is our task to deduce the identities of the decimal point and digit | ||||||
| -- | -- separator characters, based on the allowed syntax. For instance, we | ||||||
| -- - the digit group(s), possibly several separated by the above char, occuring before.. | -- make use of the fact that a decimal point can occur at most once and | ||||||
| -- | -- must succeed all digit group separators. | ||||||
| -- - the second and last separator char, and following digit group, if any. |  | ||||||
| -- | -- | ||||||
| -- >>> parseTest rawnumberp "1,234,567.89" | -- >>> parseTest rawnumberp "1,234,567.89" | ||||||
| -- (Just ',',["1","234","567"],Just ('.',"89")) | -- BothSeparators ',' ["1","234","567"] '.' "89" | ||||||
| -- >>> parseTest rawnumberp "1 000" | -- >>> parseTest rawnumberp "1 000" | ||||||
| -- (Just ' ',["1","000"],Nothing) | -- AmbiguousNumber "1" ' ' "000" | ||||||
| rawnumberp :: TextParser m ( Maybe Char , [String] , Maybe (Char, String) ) | -- | ||||||
| rawnumberp = do | rawnumberp :: TextParser m RawNumber | ||||||
|     (firstSep, groups) <- option (Nothing, []) $ do | rawnumberp = label "rawnumberp" $ do | ||||||
|         leadingDigits <- some digitChar |   rawNumber <- leadingDecimalPt <|> leadingDigits | ||||||
|         option (Nothing, [leadingDigits]) . try $ do |  | ||||||
|             firstSep <- oneOf decimalPointChars <|> whitespaceChar |  | ||||||
|             secondGroup <- some digitChar |  | ||||||
|             otherGroups <- many $ try $ char firstSep *> some digitChar |  | ||||||
|             return (Just firstSep, leadingDigits : secondGroup : otherGroups) |  | ||||||
| 
 | 
 | ||||||
|     let remSepChars = maybe decimalPointChars (`delete` decimalPointChars) firstSep |   -- Guard against mistyped numbers | ||||||
|         modifier |   notFollowedBy $ satisfy isDecimalPointChar <|> (char ' ' >> digitChar) | ||||||
|             | null groups = fmap Just  -- if no digits so far, we require at least some decimals |  | ||||||
|             | otherwise = optional |  | ||||||
| 
 | 
 | ||||||
|     extraGroup <- modifier $ do |   return $ dbg8 "rawnumberp" rawNumber | ||||||
|         lastSep <- oneOf remSepChars |  | ||||||
|         digits <- modifier $ some digitChar  -- decimal separator allowed to be without digits if had some before |  | ||||||
|         return (lastSep, fromMaybe [] digits) |  | ||||||
| 
 | 
 | ||||||
|     -- make sure we didn't leading part of mistyped number |   where | ||||||
|     notFollowedBy $ oneOf decimalPointChars <|> (whitespaceChar >> digitChar) |  | ||||||
| 
 | 
 | ||||||
|     return $ dbg8 "rawnumberp" (firstSep, groups, extraGroup) |   leadingDecimalPt :: TextParser m RawNumber | ||||||
|     <?> "rawnumberp" |   leadingDecimalPt = | ||||||
|  |     LeadingDecimalPt <$> satisfy isDecimalPointChar <*> pdigitgroup | ||||||
| 
 | 
 | ||||||
| decimalPointChars :: String |   leadingDigits :: TextParser m RawNumber | ||||||
| decimalPointChars = ".," |   leadingDigits = do | ||||||
|  |     grp1 <- pdigitgroup | ||||||
|  |     withSeparators grp1 <|> trailingDecimalPt grp1 <|> pure (NoSeparators grp1) | ||||||
| 
 | 
 | ||||||
| -- | Parse a unicode char that represents any non-control space char (Zs general category). |   withSeparators :: DigitGrp -> TextParser m RawNumber | ||||||
| whitespaceChar :: TextParser m Char |   withSeparators grp1 = do | ||||||
| whitespaceChar = charCategory Space |     (sep, grp2) <- try $ (,) <$> satisfy isDigitSeparatorChar <*> pdigitgroup | ||||||
|  |     grps <- many $ try $ char sep *> pdigitgroup | ||||||
|  | 
 | ||||||
|  |     let digitGroups = grp1 : grp2 : grps | ||||||
|  |     withDecimalPt sep digitGroups <|> pure (withoutDecimalPt grp1 sep grp2 grps) | ||||||
|  | 
 | ||||||
|  |   withDecimalPt :: Char -> [DigitGrp] -> TextParser m RawNumber | ||||||
|  |   withDecimalPt digitSep digitGroups = do | ||||||
|  |     decimalPt <- satisfy $ \c -> isDecimalPointChar c && c /= digitSep | ||||||
|  |     decimalDigitGroup <- option mempty pdigitgroup | ||||||
|  | 
 | ||||||
|  |     pure $ BothSeparators digitSep digitGroups decimalPt decimalDigitGroup | ||||||
|  | 
 | ||||||
|  |   withoutDecimalPt :: DigitGrp -> Char -> DigitGrp -> [DigitGrp] -> RawNumber | ||||||
|  |   withoutDecimalPt grp1 sep grp2 grps | ||||||
|  |     | null grps = AmbiguousNumber grp1 sep grp2 | ||||||
|  |     | otherwise = DigitSeparators sep (grp1:grp2:grps) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |   trailingDecimalPt :: DigitGrp -> TextParser m RawNumber | ||||||
|  |   trailingDecimalPt grp1 = do | ||||||
|  |     decimalPt <- satisfy isDecimalPointChar | ||||||
|  |     pure $ TrailingDecimalPt grp1 decimalPt | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | isDecimalPointChar :: Char -> Bool | ||||||
|  | isDecimalPointChar c = c == '.' || c == ',' | ||||||
|  | 
 | ||||||
|  | isDigitSeparatorChar :: Char -> Bool | ||||||
|  | isDigitSeparatorChar c = isDecimalPointChar c || c == ' ' | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | data DigitGrp = DigitGrp { | ||||||
|  |   digitGroupLength :: Int, | ||||||
|  |   digitGroupNumber :: Integer | ||||||
|  | } deriving (Eq) | ||||||
|  | 
 | ||||||
|  | instance Show DigitGrp where | ||||||
|  |   show (DigitGrp len num) | ||||||
|  |     | len > 0 = "\"" ++ padding ++ numStr ++ "\"" | ||||||
|  |     | otherwise = "\"\"" | ||||||
|  |     where numStr = show num | ||||||
|  |           padding = replicate (len - length numStr) '0' | ||||||
|  | 
 | ||||||
|  | instance Sem.Semigroup DigitGrp where | ||||||
|  |   DigitGrp l1 n1 <> DigitGrp l2 n2 = DigitGrp (l1 + l2) (n1 * 10^l2 + n2) | ||||||
|  | 
 | ||||||
|  | instance Monoid DigitGrp where | ||||||
|  |   mempty = DigitGrp 0 0 | ||||||
|  |   mappend = (Sem.<>) | ||||||
|  | 
 | ||||||
|  | pdigitgroup :: TextParser m DigitGrp | ||||||
|  | pdigitgroup = label "digit group" | ||||||
|  |             $ makeGroup <$> takeWhile1P (Just "digit") isDigit | ||||||
|  |   where | ||||||
|  |     makeGroup = uncurry DigitGrp . foldl' step (0, 0) . T.unpack | ||||||
|  |     step (!l, !a) c = (l+1, a*10 + fromIntegral (digitToInt c)) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | data RawNumber | ||||||
|  |   = LeadingDecimalPt  Char DigitGrp                 -- .50 | ||||||
|  |   | TrailingDecimalPt DigitGrp Char                 -- 100. | ||||||
|  |   | NoSeparators      DigitGrp                      -- 100 | ||||||
|  |   | AmbiguousNumber   DigitGrp Char DigitGrp        -- 1,000 | ||||||
|  |   | DigitSeparators   Char [DigitGrp]               -- 1,000,000 | ||||||
|  |   | BothSeparators    Char [DigitGrp] Char DigitGrp -- 1,000.50 | ||||||
|  |   deriving (Show, Eq) | ||||||
| 
 | 
 | ||||||
| -- test_numberp = do | -- test_numberp = do | ||||||
| --       let s `is` n = assertParseEqual (parseWithState mempty numberp s) n | --       let s `is` n = assertParseEqual (parseWithState mempty numberp s) n | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user