lib: refactor the raw number parser [API change]

This commit is contained in:
Alex Chen 2018-05-23 13:45:57 -06:00 committed by Simon Michael
parent 93fbac99d3
commit 121ba92ade

View File

@ -101,12 +101,17 @@ import Control.Monad.State.Strict
import Data.Bifunctor
import Data.Char
import Data.Data
import Data.Decimal (DecimalRaw (Decimal), Decimal)
import Data.Default
import Data.Functor.Identity
import Data.List.Compat
import Data.List.NonEmpty (NonEmpty(..))
import Data.Maybe
#if !(MIN_VERSION_base(4,11,0))
import Data.Monoid
#endif
import qualified Data.Map as M
import qualified Data.Semigroup as Sem
import Data.Text (Text)
import qualified Data.Text as T
import Data.Time.Calendar
@ -709,93 +714,201 @@ exponentp = do
return $ bimap (* 10^^exp) (max 0 . subtract exp)
<?> "exponentp"
-- | Interpret the raw parts of a number, using the provided amount style if any,
-- determining the decimal point character and digit groups where possible.
-- | Interpret a raw number as a decimal number, and identify the decimal
-- point charcter and digit separating scheme. There is only one ambiguous
-- case: when there is just a single separator between two digit groups.
-- Disambiguate using an amount style, if provided; otherwise, assume that
-- the separator is a decimal point.
--
-- Returns:
-- - the decimal number
-- - the precision (number of digits after the decimal point)
-- - the decimal point character, if any
-- - the digit group style, if any (digit group character and sizes of digit groups)
fromRawNumber :: Maybe AmountStyle -> Bool -> (Maybe Char, [String], Maybe (Char, String)) -> (Quantity, Int, Maybe Char, Maybe DigitGroupStyle)
fromRawNumber suggestedStyle negated raw = (quantity, precision, mdecimalpoint, mgrps) where
-- unpack with a hint if useful
(mseparator, intparts, mdecimalpoint, frac) =
case raw of
-- If the number consists of exactly two digit groups
-- separated by a valid decimal point character, we assume
-- that the character represents a decimal point.
(Just s, [firstGroup, lastGroup], Nothing)
| s `elem` decimalPointChars && maybe True (`asdecimalcheck` s) suggestedStyle ->
(Nothing, [firstGroup], Just s, lastGroup)
fromRawNumber
:: Maybe AmountStyle
-> Bool
-> RawNumber
-> (Quantity, Int, Maybe Char, Maybe DigitGroupStyle)
fromRawNumber suggestedStyle negated raw = case raw of
(firstSep, digitGroups, Nothing) -> (firstSep, digitGroups, Nothing, [])
(firstSep, digitGroups, Just (d, frac)) -> (firstSep, digitGroups, Just d, frac)
LeadingDecimalPt decPt digitGrp ->
let quantity = sign $ Decimal (fromIntegral precision)
(digitGroupNumber digitGrp)
precision = digitGroupLength digitGrp
in (quantity, precision, Just decPt, Nothing)
-- get the digit group sizes and digit group style if any
groupsizes = reverse $ case map length intparts of
TrailingDecimalPt digitGrp decPt ->
let quantity = sign $ Decimal (fromIntegral precision)
(digitGroupNumber digitGrp)
precision = 0
in (quantity, precision, Just decPt, Nothing)
NoSeparators digitGrp ->
let quantity = sign $ Decimal (fromIntegral precision)
(digitGroupNumber digitGrp)
precision = 0
in (quantity, precision, Nothing, Nothing)
AmbiguousNumber digitGrp1 sep digitGrp2 ->
-- If present, use the suggested style to disambiguate;
-- otherwise, assume that the separator is a decimal point where possible.
if isDecimalPointChar sep && maybe True (sep `isValidDecimalBy`) suggestedStyle
then -- Assuming that the separator is a decimal point
let quantity = sign $ Decimal (fromIntegral precision)
(digitGroupNumber $ digitGrp1 <> digitGrp2)
precision = digitGroupLength digitGrp2
in (quantity, precision, Just sep, Nothing)
else -- Assuming that the separator is digit separator
let quantity = sign $ Decimal (fromIntegral precision)
(digitGroupNumber $ digitGrp1 <> digitGrp2)
precision = 0
digitGroupStyle = Just $
DigitGroups sep (groupSizes $ [digitGrp1, digitGrp2])
in (quantity, precision, Nothing, digitGroupStyle)
DigitSeparators digitSep digitGrps ->
let quantity = sign $ Decimal (fromIntegral precision)
(digitGroupNumber $ mconcat digitGrps)
precision = 0
digitGroupStyle = Just $ DigitGroups digitSep (groupSizes digitGrps)
in (quantity, precision, Nothing, digitGroupStyle)
BothSeparators digitSep digitGrps decPt decimalGrp ->
let quantity =
sign $ Decimal (fromIntegral precision)
(digitGroupNumber $ mconcat digitGrps <> decimalGrp)
precision = digitGroupLength decimalGrp
digitGroupStyle = Just $ DigitGroups digitSep (groupSizes digitGrps)
in (quantity, precision, Just decPt, digitGroupStyle)
where
sign :: Decimal -> Decimal
sign = if negated then negate else id
-- Outputs digit group sizes from least significant to most significant
groupSizes :: [DigitGrp] -> [Int]
groupSizes digitGrps = reverse $ case map digitGroupLength digitGrps of
(a:b:cs) | a < b -> b:cs
gs -> gs
mgrps = (`DigitGroups` groupsizes) <$> mseparator
-- put the parts back together without digit group separators, get the precision and parse the value
repr = (if negated then "-" else "") ++ "0" ++ concat intparts ++ (if null frac then "" else "." ++ frac)
quantity = read repr
precision = length frac
isValidDecimalBy :: Char -> AmountStyle -> Bool
isValidDecimalBy c = \case
AmountStyle{asdecimalpoint = Just d} -> d == c
AmountStyle{asdigitgroups = Just (DigitGroups g _)} -> g /= c
AmountStyle{asprecision = 0} -> False
_ -> True
asdecimalcheck :: AmountStyle -> Char -> Bool
asdecimalcheck = \case
AmountStyle{asdecimalpoint = Just d} -> (d ==)
AmountStyle{asdigitgroups = Just (DigitGroups g _)} -> (g /=)
AmountStyle{asprecision = 0} -> const False
_ -> const True
-- | Pre-parse a number into parts for further interpretation.
-- Numbers may optionally have a period/comma decimal point
-- and/or comma/period/space digit group separators, but we don't
-- decide which is which here, just return the parts:
-- | Parse and interpret the structure of a number as far as possible
-- without external hints. Numbers are digit strings, possibly separated
-- into digit groups by one of two types of separators. (1) Numbers may
-- optionally have a decimal point, which may be either a period or comma.
-- (2) Numbers may optionally contain digit group separators, which must
-- all be either a period, a comma, or a space.
--
-- - the first separator char (period or comma or space) seen, if any
--
-- - the digit group(s), possibly several separated by the above char, occuring before..
--
-- - the second and last separator char, and following digit group, if any.
-- It is our task to deduce the identities of the decimal point and digit
-- separator characters, based on the allowed syntax. For instance, we
-- make use of the fact that a decimal point can occur at most once and
-- must succeed all digit group separators.
--
-- >>> parseTest rawnumberp "1,234,567.89"
-- (Just ',',["1","234","567"],Just ('.',"89"))
-- BothSeparators ',' ["1","234","567"] '.' "89"
-- >>> parseTest rawnumberp "1 000"
-- (Just ' ',["1","000"],Nothing)
rawnumberp :: TextParser m ( Maybe Char , [String] , Maybe (Char, String) )
rawnumberp = do
(firstSep, groups) <- option (Nothing, []) $ do
leadingDigits <- some digitChar
option (Nothing, [leadingDigits]) . try $ do
firstSep <- oneOf decimalPointChars <|> whitespaceChar
secondGroup <- some digitChar
otherGroups <- many $ try $ char firstSep *> some digitChar
return (Just firstSep, leadingDigits : secondGroup : otherGroups)
-- AmbiguousNumber "1" ' ' "000"
--
rawnumberp :: TextParser m RawNumber
rawnumberp = label "rawnumberp" $ do
rawNumber <- leadingDecimalPt <|> leadingDigits
let remSepChars = maybe decimalPointChars (`delete` decimalPointChars) firstSep
modifier
| null groups = fmap Just -- if no digits so far, we require at least some decimals
| otherwise = optional
-- Guard against mistyped numbers
notFollowedBy $ satisfy isDecimalPointChar <|> (char ' ' >> digitChar)
extraGroup <- modifier $ do
lastSep <- oneOf remSepChars
digits <- modifier $ some digitChar -- decimal separator allowed to be without digits if had some before
return (lastSep, fromMaybe [] digits)
return $ dbg8 "rawnumberp" rawNumber
-- make sure we didn't leading part of mistyped number
notFollowedBy $ oneOf decimalPointChars <|> (whitespaceChar >> digitChar)
where
return $ dbg8 "rawnumberp" (firstSep, groups, extraGroup)
<?> "rawnumberp"
leadingDecimalPt :: TextParser m RawNumber
leadingDecimalPt =
LeadingDecimalPt <$> satisfy isDecimalPointChar <*> pdigitgroup
decimalPointChars :: String
decimalPointChars = ".,"
leadingDigits :: TextParser m RawNumber
leadingDigits = do
grp1 <- pdigitgroup
withSeparators grp1 <|> trailingDecimalPt grp1 <|> pure (NoSeparators grp1)
-- | Parse a unicode char that represents any non-control space char (Zs general category).
whitespaceChar :: TextParser m Char
whitespaceChar = charCategory Space
withSeparators :: DigitGrp -> TextParser m RawNumber
withSeparators grp1 = do
(sep, grp2) <- try $ (,) <$> satisfy isDigitSeparatorChar <*> pdigitgroup
grps <- many $ try $ char sep *> pdigitgroup
let digitGroups = grp1 : grp2 : grps
withDecimalPt sep digitGroups <|> pure (withoutDecimalPt grp1 sep grp2 grps)
withDecimalPt :: Char -> [DigitGrp] -> TextParser m RawNumber
withDecimalPt digitSep digitGroups = do
decimalPt <- satisfy $ \c -> isDecimalPointChar c && c /= digitSep
decimalDigitGroup <- option mempty pdigitgroup
pure $ BothSeparators digitSep digitGroups decimalPt decimalDigitGroup
withoutDecimalPt :: DigitGrp -> Char -> DigitGrp -> [DigitGrp] -> RawNumber
withoutDecimalPt grp1 sep grp2 grps
| null grps = AmbiguousNumber grp1 sep grp2
| otherwise = DigitSeparators sep (grp1:grp2:grps)
trailingDecimalPt :: DigitGrp -> TextParser m RawNumber
trailingDecimalPt grp1 = do
decimalPt <- satisfy isDecimalPointChar
pure $ TrailingDecimalPt grp1 decimalPt
isDecimalPointChar :: Char -> Bool
isDecimalPointChar c = c == '.' || c == ','
isDigitSeparatorChar :: Char -> Bool
isDigitSeparatorChar c = isDecimalPointChar c || c == ' '
data DigitGrp = DigitGrp {
digitGroupLength :: Int,
digitGroupNumber :: Integer
} deriving (Eq)
instance Show DigitGrp where
show (DigitGrp len num)
| len > 0 = "\"" ++ padding ++ numStr ++ "\""
| otherwise = "\"\""
where numStr = show num
padding = replicate (len - length numStr) '0'
instance Sem.Semigroup DigitGrp where
DigitGrp l1 n1 <> DigitGrp l2 n2 = DigitGrp (l1 + l2) (n1 * 10^l2 + n2)
instance Monoid DigitGrp where
mempty = DigitGrp 0 0
mappend = (Sem.<>)
pdigitgroup :: TextParser m DigitGrp
pdigitgroup = label "digit group"
$ makeGroup <$> takeWhile1P (Just "digit") isDigit
where
makeGroup = uncurry DigitGrp . foldl' step (0, 0) . T.unpack
step (!l, !a) c = (l+1, a*10 + fromIntegral (digitToInt c))
data RawNumber
= LeadingDecimalPt Char DigitGrp -- .50
| TrailingDecimalPt DigitGrp Char -- 100.
| NoSeparators DigitGrp -- 100
| AmbiguousNumber DigitGrp Char DigitGrp -- 1,000
| DigitSeparators Char [DigitGrp] -- 1,000,000
| BothSeparators Char [DigitGrp] Char DigitGrp -- 1,000.50
deriving (Show, Eq)
-- test_numberp = do
-- let s `is` n = assertParseEqual (parseWithState mempty numberp s) n