lib: refactor the raw number parser [API change]
This commit is contained in:
		
							parent
							
								
									93fbac99d3
								
							
						
					
					
						commit
						121ba92ade
					
				| @ -101,12 +101,17 @@ import Control.Monad.State.Strict | ||||
| import Data.Bifunctor | ||||
| import Data.Char | ||||
| import Data.Data | ||||
| import Data.Decimal (DecimalRaw (Decimal), Decimal) | ||||
| import Data.Default | ||||
| import Data.Functor.Identity | ||||
| import Data.List.Compat | ||||
| import Data.List.NonEmpty (NonEmpty(..)) | ||||
| import Data.Maybe | ||||
| #if !(MIN_VERSION_base(4,11,0)) | ||||
| import Data.Monoid | ||||
| #endif | ||||
| import qualified Data.Map as M | ||||
| import qualified Data.Semigroup as Sem | ||||
| import Data.Text (Text) | ||||
| import qualified Data.Text as T | ||||
| import Data.Time.Calendar | ||||
| @ -709,93 +714,201 @@ exponentp = do | ||||
|     return $ bimap (* 10^^exp) (max 0 . subtract exp) | ||||
|     <?> "exponentp" | ||||
| 
 | ||||
| -- | Interpret the raw parts of a number, using the provided amount style if any, | ||||
| -- determining the decimal point character and digit groups where possible. | ||||
| -- | Interpret a raw number as a decimal number, and identify the decimal | ||||
| -- point charcter and digit separating scheme. There is only one ambiguous | ||||
| -- case: when there is just a single separator between two digit groups. | ||||
| -- Disambiguate using an amount style, if provided; otherwise, assume that | ||||
| -- the separator is a decimal point. | ||||
| -- | ||||
| -- Returns: | ||||
| -- - the decimal number | ||||
| -- - the precision (number of digits after the decimal point)   | ||||
| -- - the decimal point character, if any | ||||
| -- - the digit group style, if any (digit group character and sizes of digit groups) | ||||
| fromRawNumber :: Maybe AmountStyle -> Bool -> (Maybe Char, [String], Maybe (Char, String)) -> (Quantity, Int, Maybe Char, Maybe DigitGroupStyle) | ||||
| fromRawNumber suggestedStyle negated raw = (quantity, precision, mdecimalpoint, mgrps) where | ||||
|     -- unpack with a hint if useful | ||||
|     (mseparator, intparts, mdecimalpoint, frac) = | ||||
|             case raw of | ||||
|                 -- If the number consists of exactly two digit groups | ||||
|                 -- separated by a valid decimal point character, we assume | ||||
|                 -- that the character represents a decimal point. | ||||
|                 (Just s, [firstGroup, lastGroup], Nothing) | ||||
|                     | s `elem` decimalPointChars && maybe True (`asdecimalcheck` s) suggestedStyle -> | ||||
|                         (Nothing, [firstGroup], Just s, lastGroup) | ||||
| fromRawNumber | ||||
|   :: Maybe AmountStyle | ||||
|   -> Bool | ||||
|   -> RawNumber | ||||
|   -> (Quantity, Int, Maybe Char, Maybe DigitGroupStyle) | ||||
| fromRawNumber suggestedStyle negated raw = case raw of | ||||
| 
 | ||||
|                 (firstSep, digitGroups, Nothing) -> (firstSep, digitGroups, Nothing, []) | ||||
|                 (firstSep, digitGroups, Just (d, frac)) -> (firstSep, digitGroups, Just d, frac) | ||||
|   LeadingDecimalPt decPt digitGrp -> | ||||
|     let quantity = sign $ Decimal (fromIntegral precision) | ||||
|                                   (digitGroupNumber digitGrp) | ||||
|         precision = digitGroupLength digitGrp | ||||
|     in  (quantity, precision, Just decPt, Nothing) | ||||
| 
 | ||||
|     -- get the digit group sizes and digit group style if any | ||||
|     groupsizes = reverse $ case map length intparts of | ||||
|                                (a:b:cs) | a < b -> b:cs | ||||
|                                gs               -> gs | ||||
|     mgrps = (`DigitGroups` groupsizes) <$> mseparator | ||||
|   TrailingDecimalPt digitGrp decPt -> | ||||
|     let quantity = sign $ Decimal (fromIntegral precision) | ||||
|                                   (digitGroupNumber digitGrp) | ||||
|         precision = 0 | ||||
|     in  (quantity, precision, Just decPt, Nothing) | ||||
| 
 | ||||
|     -- put the parts back together without digit group separators, get the precision and parse the value | ||||
|     repr = (if negated then "-" else "") ++ "0" ++ concat intparts ++ (if null frac then "" else "." ++ frac) | ||||
|     quantity = read repr | ||||
|     precision = length frac | ||||
|   NoSeparators digitGrp -> | ||||
|     let quantity = sign $ Decimal (fromIntegral precision) | ||||
|                                   (digitGroupNumber digitGrp) | ||||
|         precision = 0 | ||||
|     in  (quantity, precision, Nothing, Nothing) | ||||
| 
 | ||||
|     asdecimalcheck :: AmountStyle -> Char -> Bool | ||||
|     asdecimalcheck = \case | ||||
|         AmountStyle{asdecimalpoint = Just d} -> (d ==) | ||||
|         AmountStyle{asdigitgroups = Just (DigitGroups g _)} -> (g /=) | ||||
|         AmountStyle{asprecision = 0} -> const False | ||||
|         _ -> const True | ||||
|   AmbiguousNumber digitGrp1 sep digitGrp2 -> | ||||
|     -- If present, use the suggested style to disambiguate; | ||||
|     -- otherwise, assume that the separator is a decimal point where possible. | ||||
|     if isDecimalPointChar sep && maybe True (sep `isValidDecimalBy`) suggestedStyle | ||||
| 
 | ||||
| -- | Pre-parse a number into parts for further interpretation. | ||||
| -- Numbers may optionally have a period/comma decimal point  | ||||
| -- and/or comma/period/space digit group separators, but we don't | ||||
| -- decide which is which here, just return the parts: | ||||
|     then -- Assuming that the separator is a decimal point | ||||
|       let quantity = sign $ Decimal (fromIntegral precision) | ||||
|                                     (digitGroupNumber $ digitGrp1 <> digitGrp2) | ||||
|           precision = digitGroupLength digitGrp2 | ||||
|       in  (quantity, precision, Just sep, Nothing) | ||||
| 
 | ||||
|     else -- Assuming that the separator is digit separator | ||||
|       let quantity = sign $ Decimal (fromIntegral precision) | ||||
|                                     (digitGroupNumber $ digitGrp1 <> digitGrp2) | ||||
|           precision = 0 | ||||
|           digitGroupStyle = Just $ | ||||
|             DigitGroups sep (groupSizes $ [digitGrp1, digitGrp2]) | ||||
|       in  (quantity, precision, Nothing, digitGroupStyle) | ||||
| 
 | ||||
|   DigitSeparators digitSep digitGrps -> | ||||
|     let quantity = sign $ Decimal (fromIntegral precision) | ||||
|                                   (digitGroupNumber $ mconcat digitGrps) | ||||
|         precision = 0 | ||||
|         digitGroupStyle = Just $ DigitGroups digitSep (groupSizes digitGrps) | ||||
|     in  (quantity, precision, Nothing, digitGroupStyle) | ||||
| 
 | ||||
|   BothSeparators digitSep digitGrps decPt decimalGrp -> | ||||
|     let quantity = | ||||
|           sign $ Decimal (fromIntegral precision) | ||||
|                          (digitGroupNumber $ mconcat digitGrps <> decimalGrp) | ||||
|         precision = digitGroupLength decimalGrp | ||||
|         digitGroupStyle = Just $ DigitGroups digitSep (groupSizes digitGrps) | ||||
|     in  (quantity, precision, Just decPt, digitGroupStyle) | ||||
| 
 | ||||
|   where | ||||
| 
 | ||||
|     sign :: Decimal -> Decimal | ||||
|     sign = if negated then negate else id | ||||
| 
 | ||||
|     -- Outputs digit group sizes from least significant to most significant | ||||
|     groupSizes :: [DigitGrp] -> [Int] | ||||
|     groupSizes digitGrps = reverse $ case map digitGroupLength digitGrps of | ||||
|       (a:b:cs) | a < b -> b:cs | ||||
|       gs               -> gs | ||||
| 
 | ||||
|     isValidDecimalBy :: Char -> AmountStyle -> Bool | ||||
|     isValidDecimalBy c = \case | ||||
|       AmountStyle{asdecimalpoint = Just d} -> d == c | ||||
|       AmountStyle{asdigitgroups = Just (DigitGroups g _)} -> g /= c | ||||
|       AmountStyle{asprecision = 0} -> False | ||||
|       _ -> True | ||||
| 
 | ||||
| 
 | ||||
| -- | Parse and interpret the structure of a number as far as possible | ||||
| -- without external hints. Numbers are digit strings, possibly separated | ||||
| -- into digit groups by one of two types of separators. (1) Numbers may | ||||
| -- optionally have a decimal point, which may be either a period or comma. | ||||
| -- (2) Numbers may optionally contain digit group separators, which must | ||||
| -- all be either a period, a comma, or a space. | ||||
| -- | ||||
| -- - the first separator char (period or comma or space) seen, if any | ||||
| -- | ||||
| -- - the digit group(s), possibly several separated by the above char, occuring before.. | ||||
| -- | ||||
| -- - the second and last separator char, and following digit group, if any. | ||||
| -- It is our task to deduce the identities of the decimal point and digit | ||||
| -- separator characters, based on the allowed syntax. For instance, we | ||||
| -- make use of the fact that a decimal point can occur at most once and | ||||
| -- must succeed all digit group separators. | ||||
| -- | ||||
| -- >>> parseTest rawnumberp "1,234,567.89" | ||||
| -- (Just ',',["1","234","567"],Just ('.',"89")) | ||||
| -- BothSeparators ',' ["1","234","567"] '.' "89" | ||||
| -- >>> parseTest rawnumberp "1 000" | ||||
| -- (Just ' ',["1","000"],Nothing) | ||||
| rawnumberp :: TextParser m ( Maybe Char , [String] , Maybe (Char, String) ) | ||||
| rawnumberp = do | ||||
|     (firstSep, groups) <- option (Nothing, []) $ do | ||||
|         leadingDigits <- some digitChar | ||||
|         option (Nothing, [leadingDigits]) . try $ do | ||||
|             firstSep <- oneOf decimalPointChars <|> whitespaceChar | ||||
|             secondGroup <- some digitChar | ||||
|             otherGroups <- many $ try $ char firstSep *> some digitChar | ||||
|             return (Just firstSep, leadingDigits : secondGroup : otherGroups) | ||||
| -- AmbiguousNumber "1" ' ' "000" | ||||
| -- | ||||
| rawnumberp :: TextParser m RawNumber | ||||
| rawnumberp = label "rawnumberp" $ do | ||||
|   rawNumber <- leadingDecimalPt <|> leadingDigits | ||||
| 
 | ||||
|     let remSepChars = maybe decimalPointChars (`delete` decimalPointChars) firstSep | ||||
|         modifier | ||||
|             | null groups = fmap Just  -- if no digits so far, we require at least some decimals | ||||
|             | otherwise = optional | ||||
|   -- Guard against mistyped numbers | ||||
|   notFollowedBy $ satisfy isDecimalPointChar <|> (char ' ' >> digitChar) | ||||
| 
 | ||||
|     extraGroup <- modifier $ do | ||||
|         lastSep <- oneOf remSepChars | ||||
|         digits <- modifier $ some digitChar  -- decimal separator allowed to be without digits if had some before | ||||
|         return (lastSep, fromMaybe [] digits) | ||||
|   return $ dbg8 "rawnumberp" rawNumber | ||||
| 
 | ||||
|     -- make sure we didn't leading part of mistyped number | ||||
|     notFollowedBy $ oneOf decimalPointChars <|> (whitespaceChar >> digitChar) | ||||
|   where | ||||
| 
 | ||||
|     return $ dbg8 "rawnumberp" (firstSep, groups, extraGroup) | ||||
|     <?> "rawnumberp" | ||||
|   leadingDecimalPt :: TextParser m RawNumber | ||||
|   leadingDecimalPt = | ||||
|     LeadingDecimalPt <$> satisfy isDecimalPointChar <*> pdigitgroup | ||||
| 
 | ||||
| decimalPointChars :: String | ||||
| decimalPointChars = ".," | ||||
|   leadingDigits :: TextParser m RawNumber | ||||
|   leadingDigits = do | ||||
|     grp1 <- pdigitgroup | ||||
|     withSeparators grp1 <|> trailingDecimalPt grp1 <|> pure (NoSeparators grp1) | ||||
| 
 | ||||
| -- | Parse a unicode char that represents any non-control space char (Zs general category). | ||||
| whitespaceChar :: TextParser m Char | ||||
| whitespaceChar = charCategory Space | ||||
|   withSeparators :: DigitGrp -> TextParser m RawNumber | ||||
|   withSeparators grp1 = do | ||||
|     (sep, grp2) <- try $ (,) <$> satisfy isDigitSeparatorChar <*> pdigitgroup | ||||
|     grps <- many $ try $ char sep *> pdigitgroup | ||||
| 
 | ||||
|     let digitGroups = grp1 : grp2 : grps | ||||
|     withDecimalPt sep digitGroups <|> pure (withoutDecimalPt grp1 sep grp2 grps) | ||||
| 
 | ||||
|   withDecimalPt :: Char -> [DigitGrp] -> TextParser m RawNumber | ||||
|   withDecimalPt digitSep digitGroups = do | ||||
|     decimalPt <- satisfy $ \c -> isDecimalPointChar c && c /= digitSep | ||||
|     decimalDigitGroup <- option mempty pdigitgroup | ||||
| 
 | ||||
|     pure $ BothSeparators digitSep digitGroups decimalPt decimalDigitGroup | ||||
| 
 | ||||
|   withoutDecimalPt :: DigitGrp -> Char -> DigitGrp -> [DigitGrp] -> RawNumber | ||||
|   withoutDecimalPt grp1 sep grp2 grps | ||||
|     | null grps = AmbiguousNumber grp1 sep grp2 | ||||
|     | otherwise = DigitSeparators sep (grp1:grp2:grps) | ||||
| 
 | ||||
| 
 | ||||
|   trailingDecimalPt :: DigitGrp -> TextParser m RawNumber | ||||
|   trailingDecimalPt grp1 = do | ||||
|     decimalPt <- satisfy isDecimalPointChar | ||||
|     pure $ TrailingDecimalPt grp1 decimalPt | ||||
| 
 | ||||
| 
 | ||||
| isDecimalPointChar :: Char -> Bool | ||||
| isDecimalPointChar c = c == '.' || c == ',' | ||||
| 
 | ||||
| isDigitSeparatorChar :: Char -> Bool | ||||
| isDigitSeparatorChar c = isDecimalPointChar c || c == ' ' | ||||
| 
 | ||||
| 
 | ||||
| data DigitGrp = DigitGrp { | ||||
|   digitGroupLength :: Int, | ||||
|   digitGroupNumber :: Integer | ||||
| } deriving (Eq) | ||||
| 
 | ||||
| instance Show DigitGrp where | ||||
|   show (DigitGrp len num) | ||||
|     | len > 0 = "\"" ++ padding ++ numStr ++ "\"" | ||||
|     | otherwise = "\"\"" | ||||
|     where numStr = show num | ||||
|           padding = replicate (len - length numStr) '0' | ||||
| 
 | ||||
| instance Sem.Semigroup DigitGrp where | ||||
|   DigitGrp l1 n1 <> DigitGrp l2 n2 = DigitGrp (l1 + l2) (n1 * 10^l2 + n2) | ||||
| 
 | ||||
| instance Monoid DigitGrp where | ||||
|   mempty = DigitGrp 0 0 | ||||
|   mappend = (Sem.<>) | ||||
| 
 | ||||
| pdigitgroup :: TextParser m DigitGrp | ||||
| pdigitgroup = label "digit group" | ||||
|             $ makeGroup <$> takeWhile1P (Just "digit") isDigit | ||||
|   where | ||||
|     makeGroup = uncurry DigitGrp . foldl' step (0, 0) . T.unpack | ||||
|     step (!l, !a) c = (l+1, a*10 + fromIntegral (digitToInt c)) | ||||
| 
 | ||||
| 
 | ||||
| data RawNumber | ||||
|   = LeadingDecimalPt  Char DigitGrp                 -- .50 | ||||
|   | TrailingDecimalPt DigitGrp Char                 -- 100. | ||||
|   | NoSeparators      DigitGrp                      -- 100 | ||||
|   | AmbiguousNumber   DigitGrp Char DigitGrp        -- 1,000 | ||||
|   | DigitSeparators   Char [DigitGrp]               -- 1,000,000 | ||||
|   | BothSeparators    Char [DigitGrp] Char DigitGrp -- 1,000.50 | ||||
|   deriving (Show, Eq) | ||||
| 
 | ||||
| -- test_numberp = do | ||||
| --       let s `is` n = assertParseEqual (parseWithState mempty numberp s) n | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user