lib: refactor the raw number parser [API change]

This commit is contained in:
Alex Chen 2018-05-23 13:45:57 -06:00 committed by Simon Michael
parent 93fbac99d3
commit 121ba92ade

View File

@ -101,12 +101,17 @@ import Control.Monad.State.Strict
import Data.Bifunctor import Data.Bifunctor
import Data.Char import Data.Char
import Data.Data import Data.Data
import Data.Decimal (DecimalRaw (Decimal), Decimal)
import Data.Default import Data.Default
import Data.Functor.Identity import Data.Functor.Identity
import Data.List.Compat import Data.List.Compat
import Data.List.NonEmpty (NonEmpty(..)) import Data.List.NonEmpty (NonEmpty(..))
import Data.Maybe import Data.Maybe
#if !(MIN_VERSION_base(4,11,0))
import Data.Monoid
#endif
import qualified Data.Map as M import qualified Data.Map as M
import qualified Data.Semigroup as Sem
import Data.Text (Text) import Data.Text (Text)
import qualified Data.Text as T import qualified Data.Text as T
import Data.Time.Calendar import Data.Time.Calendar
@ -709,93 +714,201 @@ exponentp = do
return $ bimap (* 10^^exp) (max 0 . subtract exp) return $ bimap (* 10^^exp) (max 0 . subtract exp)
<?> "exponentp" <?> "exponentp"
-- | Interpret the raw parts of a number, using the provided amount style if any, -- | Interpret a raw number as a decimal number, and identify the decimal
-- determining the decimal point character and digit groups where possible. -- point charcter and digit separating scheme. There is only one ambiguous
-- case: when there is just a single separator between two digit groups.
-- Disambiguate using an amount style, if provided; otherwise, assume that
-- the separator is a decimal point.
--
-- Returns: -- Returns:
-- - the decimal number -- - the decimal number
-- - the precision (number of digits after the decimal point) -- - the precision (number of digits after the decimal point)
-- - the decimal point character, if any -- - the decimal point character, if any
-- - the digit group style, if any (digit group character and sizes of digit groups) -- - the digit group style, if any (digit group character and sizes of digit groups)
fromRawNumber :: Maybe AmountStyle -> Bool -> (Maybe Char, [String], Maybe (Char, String)) -> (Quantity, Int, Maybe Char, Maybe DigitGroupStyle) fromRawNumber
fromRawNumber suggestedStyle negated raw = (quantity, precision, mdecimalpoint, mgrps) where :: Maybe AmountStyle
-- unpack with a hint if useful -> Bool
(mseparator, intparts, mdecimalpoint, frac) = -> RawNumber
case raw of -> (Quantity, Int, Maybe Char, Maybe DigitGroupStyle)
-- If the number consists of exactly two digit groups fromRawNumber suggestedStyle negated raw = case raw of
-- separated by a valid decimal point character, we assume
-- that the character represents a decimal point.
(Just s, [firstGroup, lastGroup], Nothing)
| s `elem` decimalPointChars && maybe True (`asdecimalcheck` s) suggestedStyle ->
(Nothing, [firstGroup], Just s, lastGroup)
(firstSep, digitGroups, Nothing) -> (firstSep, digitGroups, Nothing, []) LeadingDecimalPt decPt digitGrp ->
(firstSep, digitGroups, Just (d, frac)) -> (firstSep, digitGroups, Just d, frac) let quantity = sign $ Decimal (fromIntegral precision)
(digitGroupNumber digitGrp)
precision = digitGroupLength digitGrp
in (quantity, precision, Just decPt, Nothing)
-- get the digit group sizes and digit group style if any TrailingDecimalPt digitGrp decPt ->
groupsizes = reverse $ case map length intparts of let quantity = sign $ Decimal (fromIntegral precision)
(digitGroupNumber digitGrp)
precision = 0
in (quantity, precision, Just decPt, Nothing)
NoSeparators digitGrp ->
let quantity = sign $ Decimal (fromIntegral precision)
(digitGroupNumber digitGrp)
precision = 0
in (quantity, precision, Nothing, Nothing)
AmbiguousNumber digitGrp1 sep digitGrp2 ->
-- If present, use the suggested style to disambiguate;
-- otherwise, assume that the separator is a decimal point where possible.
if isDecimalPointChar sep && maybe True (sep `isValidDecimalBy`) suggestedStyle
then -- Assuming that the separator is a decimal point
let quantity = sign $ Decimal (fromIntegral precision)
(digitGroupNumber $ digitGrp1 <> digitGrp2)
precision = digitGroupLength digitGrp2
in (quantity, precision, Just sep, Nothing)
else -- Assuming that the separator is digit separator
let quantity = sign $ Decimal (fromIntegral precision)
(digitGroupNumber $ digitGrp1 <> digitGrp2)
precision = 0
digitGroupStyle = Just $
DigitGroups sep (groupSizes $ [digitGrp1, digitGrp2])
in (quantity, precision, Nothing, digitGroupStyle)
DigitSeparators digitSep digitGrps ->
let quantity = sign $ Decimal (fromIntegral precision)
(digitGroupNumber $ mconcat digitGrps)
precision = 0
digitGroupStyle = Just $ DigitGroups digitSep (groupSizes digitGrps)
in (quantity, precision, Nothing, digitGroupStyle)
BothSeparators digitSep digitGrps decPt decimalGrp ->
let quantity =
sign $ Decimal (fromIntegral precision)
(digitGroupNumber $ mconcat digitGrps <> decimalGrp)
precision = digitGroupLength decimalGrp
digitGroupStyle = Just $ DigitGroups digitSep (groupSizes digitGrps)
in (quantity, precision, Just decPt, digitGroupStyle)
where
sign :: Decimal -> Decimal
sign = if negated then negate else id
-- Outputs digit group sizes from least significant to most significant
groupSizes :: [DigitGrp] -> [Int]
groupSizes digitGrps = reverse $ case map digitGroupLength digitGrps of
(a:b:cs) | a < b -> b:cs (a:b:cs) | a < b -> b:cs
gs -> gs gs -> gs
mgrps = (`DigitGroups` groupsizes) <$> mseparator
-- put the parts back together without digit group separators, get the precision and parse the value isValidDecimalBy :: Char -> AmountStyle -> Bool
repr = (if negated then "-" else "") ++ "0" ++ concat intparts ++ (if null frac then "" else "." ++ frac) isValidDecimalBy c = \case
quantity = read repr AmountStyle{asdecimalpoint = Just d} -> d == c
precision = length frac AmountStyle{asdigitgroups = Just (DigitGroups g _)} -> g /= c
AmountStyle{asprecision = 0} -> False
_ -> True
asdecimalcheck :: AmountStyle -> Char -> Bool
asdecimalcheck = \case
AmountStyle{asdecimalpoint = Just d} -> (d ==)
AmountStyle{asdigitgroups = Just (DigitGroups g _)} -> (g /=)
AmountStyle{asprecision = 0} -> const False
_ -> const True
-- | Pre-parse a number into parts for further interpretation. -- | Parse and interpret the structure of a number as far as possible
-- Numbers may optionally have a period/comma decimal point -- without external hints. Numbers are digit strings, possibly separated
-- and/or comma/period/space digit group separators, but we don't -- into digit groups by one of two types of separators. (1) Numbers may
-- decide which is which here, just return the parts: -- optionally have a decimal point, which may be either a period or comma.
-- (2) Numbers may optionally contain digit group separators, which must
-- all be either a period, a comma, or a space.
-- --
-- - the first separator char (period or comma or space) seen, if any -- It is our task to deduce the identities of the decimal point and digit
-- -- separator characters, based on the allowed syntax. For instance, we
-- - the digit group(s), possibly several separated by the above char, occuring before.. -- make use of the fact that a decimal point can occur at most once and
-- -- must succeed all digit group separators.
-- - the second and last separator char, and following digit group, if any.
-- --
-- >>> parseTest rawnumberp "1,234,567.89" -- >>> parseTest rawnumberp "1,234,567.89"
-- (Just ',',["1","234","567"],Just ('.',"89")) -- BothSeparators ',' ["1","234","567"] '.' "89"
-- >>> parseTest rawnumberp "1 000" -- >>> parseTest rawnumberp "1 000"
-- (Just ' ',["1","000"],Nothing) -- AmbiguousNumber "1" ' ' "000"
rawnumberp :: TextParser m ( Maybe Char , [String] , Maybe (Char, String) ) --
rawnumberp = do rawnumberp :: TextParser m RawNumber
(firstSep, groups) <- option (Nothing, []) $ do rawnumberp = label "rawnumberp" $ do
leadingDigits <- some digitChar rawNumber <- leadingDecimalPt <|> leadingDigits
option (Nothing, [leadingDigits]) . try $ do
firstSep <- oneOf decimalPointChars <|> whitespaceChar
secondGroup <- some digitChar
otherGroups <- many $ try $ char firstSep *> some digitChar
return (Just firstSep, leadingDigits : secondGroup : otherGroups)
let remSepChars = maybe decimalPointChars (`delete` decimalPointChars) firstSep -- Guard against mistyped numbers
modifier notFollowedBy $ satisfy isDecimalPointChar <|> (char ' ' >> digitChar)
| null groups = fmap Just -- if no digits so far, we require at least some decimals
| otherwise = optional
extraGroup <- modifier $ do return $ dbg8 "rawnumberp" rawNumber
lastSep <- oneOf remSepChars
digits <- modifier $ some digitChar -- decimal separator allowed to be without digits if had some before
return (lastSep, fromMaybe [] digits)
-- make sure we didn't leading part of mistyped number where
notFollowedBy $ oneOf decimalPointChars <|> (whitespaceChar >> digitChar)
return $ dbg8 "rawnumberp" (firstSep, groups, extraGroup) leadingDecimalPt :: TextParser m RawNumber
<?> "rawnumberp" leadingDecimalPt =
LeadingDecimalPt <$> satisfy isDecimalPointChar <*> pdigitgroup
decimalPointChars :: String leadingDigits :: TextParser m RawNumber
decimalPointChars = ".," leadingDigits = do
grp1 <- pdigitgroup
withSeparators grp1 <|> trailingDecimalPt grp1 <|> pure (NoSeparators grp1)
-- | Parse a unicode char that represents any non-control space char (Zs general category). withSeparators :: DigitGrp -> TextParser m RawNumber
whitespaceChar :: TextParser m Char withSeparators grp1 = do
whitespaceChar = charCategory Space (sep, grp2) <- try $ (,) <$> satisfy isDigitSeparatorChar <*> pdigitgroup
grps <- many $ try $ char sep *> pdigitgroup
let digitGroups = grp1 : grp2 : grps
withDecimalPt sep digitGroups <|> pure (withoutDecimalPt grp1 sep grp2 grps)
withDecimalPt :: Char -> [DigitGrp] -> TextParser m RawNumber
withDecimalPt digitSep digitGroups = do
decimalPt <- satisfy $ \c -> isDecimalPointChar c && c /= digitSep
decimalDigitGroup <- option mempty pdigitgroup
pure $ BothSeparators digitSep digitGroups decimalPt decimalDigitGroup
withoutDecimalPt :: DigitGrp -> Char -> DigitGrp -> [DigitGrp] -> RawNumber
withoutDecimalPt grp1 sep grp2 grps
| null grps = AmbiguousNumber grp1 sep grp2
| otherwise = DigitSeparators sep (grp1:grp2:grps)
trailingDecimalPt :: DigitGrp -> TextParser m RawNumber
trailingDecimalPt grp1 = do
decimalPt <- satisfy isDecimalPointChar
pure $ TrailingDecimalPt grp1 decimalPt
isDecimalPointChar :: Char -> Bool
isDecimalPointChar c = c == '.' || c == ','
isDigitSeparatorChar :: Char -> Bool
isDigitSeparatorChar c = isDecimalPointChar c || c == ' '
data DigitGrp = DigitGrp {
digitGroupLength :: Int,
digitGroupNumber :: Integer
} deriving (Eq)
instance Show DigitGrp where
show (DigitGrp len num)
| len > 0 = "\"" ++ padding ++ numStr ++ "\""
| otherwise = "\"\""
where numStr = show num
padding = replicate (len - length numStr) '0'
instance Sem.Semigroup DigitGrp where
DigitGrp l1 n1 <> DigitGrp l2 n2 = DigitGrp (l1 + l2) (n1 * 10^l2 + n2)
instance Monoid DigitGrp where
mempty = DigitGrp 0 0
mappend = (Sem.<>)
pdigitgroup :: TextParser m DigitGrp
pdigitgroup = label "digit group"
$ makeGroup <$> takeWhile1P (Just "digit") isDigit
where
makeGroup = uncurry DigitGrp . foldl' step (0, 0) . T.unpack
step (!l, !a) c = (l+1, a*10 + fromIntegral (digitToInt c))
data RawNumber
= LeadingDecimalPt Char DigitGrp -- .50
| TrailingDecimalPt DigitGrp Char -- 100.
| NoSeparators DigitGrp -- 100
| AmbiguousNumber DigitGrp Char DigitGrp -- 1,000
| DigitSeparators Char [DigitGrp] -- 1,000,000
| BothSeparators Char [DigitGrp] Char DigitGrp -- 1,000.50
deriving (Show, Eq)
-- test_numberp = do -- test_numberp = do
-- let s `is` n = assertParseEqual (parseWithState mempty numberp s) n -- let s `is` n = assertParseEqual (parseWithState mempty numberp s) n