lib: refactor the raw number parser [API change]
This commit is contained in:
parent
93fbac99d3
commit
121ba92ade
@ -101,12 +101,17 @@ import Control.Monad.State.Strict
|
|||||||
import Data.Bifunctor
|
import Data.Bifunctor
|
||||||
import Data.Char
|
import Data.Char
|
||||||
import Data.Data
|
import Data.Data
|
||||||
|
import Data.Decimal (DecimalRaw (Decimal), Decimal)
|
||||||
import Data.Default
|
import Data.Default
|
||||||
import Data.Functor.Identity
|
import Data.Functor.Identity
|
||||||
import Data.List.Compat
|
import Data.List.Compat
|
||||||
import Data.List.NonEmpty (NonEmpty(..))
|
import Data.List.NonEmpty (NonEmpty(..))
|
||||||
import Data.Maybe
|
import Data.Maybe
|
||||||
|
#if !(MIN_VERSION_base(4,11,0))
|
||||||
|
import Data.Monoid
|
||||||
|
#endif
|
||||||
import qualified Data.Map as M
|
import qualified Data.Map as M
|
||||||
|
import qualified Data.Semigroup as Sem
|
||||||
import Data.Text (Text)
|
import Data.Text (Text)
|
||||||
import qualified Data.Text as T
|
import qualified Data.Text as T
|
||||||
import Data.Time.Calendar
|
import Data.Time.Calendar
|
||||||
@ -709,93 +714,201 @@ exponentp = do
|
|||||||
return $ bimap (* 10^^exp) (max 0 . subtract exp)
|
return $ bimap (* 10^^exp) (max 0 . subtract exp)
|
||||||
<?> "exponentp"
|
<?> "exponentp"
|
||||||
|
|
||||||
-- | Interpret the raw parts of a number, using the provided amount style if any,
|
-- | Interpret a raw number as a decimal number, and identify the decimal
|
||||||
-- determining the decimal point character and digit groups where possible.
|
-- point charcter and digit separating scheme. There is only one ambiguous
|
||||||
|
-- case: when there is just a single separator between two digit groups.
|
||||||
|
-- Disambiguate using an amount style, if provided; otherwise, assume that
|
||||||
|
-- the separator is a decimal point.
|
||||||
|
--
|
||||||
-- Returns:
|
-- Returns:
|
||||||
-- - the decimal number
|
-- - the decimal number
|
||||||
-- - the precision (number of digits after the decimal point)
|
-- - the precision (number of digits after the decimal point)
|
||||||
-- - the decimal point character, if any
|
-- - the decimal point character, if any
|
||||||
-- - the digit group style, if any (digit group character and sizes of digit groups)
|
-- - the digit group style, if any (digit group character and sizes of digit groups)
|
||||||
fromRawNumber :: Maybe AmountStyle -> Bool -> (Maybe Char, [String], Maybe (Char, String)) -> (Quantity, Int, Maybe Char, Maybe DigitGroupStyle)
|
fromRawNumber
|
||||||
fromRawNumber suggestedStyle negated raw = (quantity, precision, mdecimalpoint, mgrps) where
|
:: Maybe AmountStyle
|
||||||
-- unpack with a hint if useful
|
-> Bool
|
||||||
(mseparator, intparts, mdecimalpoint, frac) =
|
-> RawNumber
|
||||||
case raw of
|
-> (Quantity, Int, Maybe Char, Maybe DigitGroupStyle)
|
||||||
-- If the number consists of exactly two digit groups
|
fromRawNumber suggestedStyle negated raw = case raw of
|
||||||
-- separated by a valid decimal point character, we assume
|
|
||||||
-- that the character represents a decimal point.
|
|
||||||
(Just s, [firstGroup, lastGroup], Nothing)
|
|
||||||
| s `elem` decimalPointChars && maybe True (`asdecimalcheck` s) suggestedStyle ->
|
|
||||||
(Nothing, [firstGroup], Just s, lastGroup)
|
|
||||||
|
|
||||||
(firstSep, digitGroups, Nothing) -> (firstSep, digitGroups, Nothing, [])
|
LeadingDecimalPt decPt digitGrp ->
|
||||||
(firstSep, digitGroups, Just (d, frac)) -> (firstSep, digitGroups, Just d, frac)
|
let quantity = sign $ Decimal (fromIntegral precision)
|
||||||
|
(digitGroupNumber digitGrp)
|
||||||
|
precision = digitGroupLength digitGrp
|
||||||
|
in (quantity, precision, Just decPt, Nothing)
|
||||||
|
|
||||||
-- get the digit group sizes and digit group style if any
|
TrailingDecimalPt digitGrp decPt ->
|
||||||
groupsizes = reverse $ case map length intparts of
|
let quantity = sign $ Decimal (fromIntegral precision)
|
||||||
(a:b:cs) | a < b -> b:cs
|
(digitGroupNumber digitGrp)
|
||||||
gs -> gs
|
precision = 0
|
||||||
mgrps = (`DigitGroups` groupsizes) <$> mseparator
|
in (quantity, precision, Just decPt, Nothing)
|
||||||
|
|
||||||
-- put the parts back together without digit group separators, get the precision and parse the value
|
NoSeparators digitGrp ->
|
||||||
repr = (if negated then "-" else "") ++ "0" ++ concat intparts ++ (if null frac then "" else "." ++ frac)
|
let quantity = sign $ Decimal (fromIntegral precision)
|
||||||
quantity = read repr
|
(digitGroupNumber digitGrp)
|
||||||
precision = length frac
|
precision = 0
|
||||||
|
in (quantity, precision, Nothing, Nothing)
|
||||||
|
|
||||||
asdecimalcheck :: AmountStyle -> Char -> Bool
|
AmbiguousNumber digitGrp1 sep digitGrp2 ->
|
||||||
asdecimalcheck = \case
|
-- If present, use the suggested style to disambiguate;
|
||||||
AmountStyle{asdecimalpoint = Just d} -> (d ==)
|
-- otherwise, assume that the separator is a decimal point where possible.
|
||||||
AmountStyle{asdigitgroups = Just (DigitGroups g _)} -> (g /=)
|
if isDecimalPointChar sep && maybe True (sep `isValidDecimalBy`) suggestedStyle
|
||||||
AmountStyle{asprecision = 0} -> const False
|
|
||||||
_ -> const True
|
|
||||||
|
|
||||||
-- | Pre-parse a number into parts for further interpretation.
|
then -- Assuming that the separator is a decimal point
|
||||||
-- Numbers may optionally have a period/comma decimal point
|
let quantity = sign $ Decimal (fromIntegral precision)
|
||||||
-- and/or comma/period/space digit group separators, but we don't
|
(digitGroupNumber $ digitGrp1 <> digitGrp2)
|
||||||
-- decide which is which here, just return the parts:
|
precision = digitGroupLength digitGrp2
|
||||||
|
in (quantity, precision, Just sep, Nothing)
|
||||||
|
|
||||||
|
else -- Assuming that the separator is digit separator
|
||||||
|
let quantity = sign $ Decimal (fromIntegral precision)
|
||||||
|
(digitGroupNumber $ digitGrp1 <> digitGrp2)
|
||||||
|
precision = 0
|
||||||
|
digitGroupStyle = Just $
|
||||||
|
DigitGroups sep (groupSizes $ [digitGrp1, digitGrp2])
|
||||||
|
in (quantity, precision, Nothing, digitGroupStyle)
|
||||||
|
|
||||||
|
DigitSeparators digitSep digitGrps ->
|
||||||
|
let quantity = sign $ Decimal (fromIntegral precision)
|
||||||
|
(digitGroupNumber $ mconcat digitGrps)
|
||||||
|
precision = 0
|
||||||
|
digitGroupStyle = Just $ DigitGroups digitSep (groupSizes digitGrps)
|
||||||
|
in (quantity, precision, Nothing, digitGroupStyle)
|
||||||
|
|
||||||
|
BothSeparators digitSep digitGrps decPt decimalGrp ->
|
||||||
|
let quantity =
|
||||||
|
sign $ Decimal (fromIntegral precision)
|
||||||
|
(digitGroupNumber $ mconcat digitGrps <> decimalGrp)
|
||||||
|
precision = digitGroupLength decimalGrp
|
||||||
|
digitGroupStyle = Just $ DigitGroups digitSep (groupSizes digitGrps)
|
||||||
|
in (quantity, precision, Just decPt, digitGroupStyle)
|
||||||
|
|
||||||
|
where
|
||||||
|
|
||||||
|
sign :: Decimal -> Decimal
|
||||||
|
sign = if negated then negate else id
|
||||||
|
|
||||||
|
-- Outputs digit group sizes from least significant to most significant
|
||||||
|
groupSizes :: [DigitGrp] -> [Int]
|
||||||
|
groupSizes digitGrps = reverse $ case map digitGroupLength digitGrps of
|
||||||
|
(a:b:cs) | a < b -> b:cs
|
||||||
|
gs -> gs
|
||||||
|
|
||||||
|
isValidDecimalBy :: Char -> AmountStyle -> Bool
|
||||||
|
isValidDecimalBy c = \case
|
||||||
|
AmountStyle{asdecimalpoint = Just d} -> d == c
|
||||||
|
AmountStyle{asdigitgroups = Just (DigitGroups g _)} -> g /= c
|
||||||
|
AmountStyle{asprecision = 0} -> False
|
||||||
|
_ -> True
|
||||||
|
|
||||||
|
|
||||||
|
-- | Parse and interpret the structure of a number as far as possible
|
||||||
|
-- without external hints. Numbers are digit strings, possibly separated
|
||||||
|
-- into digit groups by one of two types of separators. (1) Numbers may
|
||||||
|
-- optionally have a decimal point, which may be either a period or comma.
|
||||||
|
-- (2) Numbers may optionally contain digit group separators, which must
|
||||||
|
-- all be either a period, a comma, or a space.
|
||||||
--
|
--
|
||||||
-- - the first separator char (period or comma or space) seen, if any
|
-- It is our task to deduce the identities of the decimal point and digit
|
||||||
--
|
-- separator characters, based on the allowed syntax. For instance, we
|
||||||
-- - the digit group(s), possibly several separated by the above char, occuring before..
|
-- make use of the fact that a decimal point can occur at most once and
|
||||||
--
|
-- must succeed all digit group separators.
|
||||||
-- - the second and last separator char, and following digit group, if any.
|
|
||||||
--
|
--
|
||||||
-- >>> parseTest rawnumberp "1,234,567.89"
|
-- >>> parseTest rawnumberp "1,234,567.89"
|
||||||
-- (Just ',',["1","234","567"],Just ('.',"89"))
|
-- BothSeparators ',' ["1","234","567"] '.' "89"
|
||||||
-- >>> parseTest rawnumberp "1 000"
|
-- >>> parseTest rawnumberp "1 000"
|
||||||
-- (Just ' ',["1","000"],Nothing)
|
-- AmbiguousNumber "1" ' ' "000"
|
||||||
rawnumberp :: TextParser m ( Maybe Char , [String] , Maybe (Char, String) )
|
--
|
||||||
rawnumberp = do
|
rawnumberp :: TextParser m RawNumber
|
||||||
(firstSep, groups) <- option (Nothing, []) $ do
|
rawnumberp = label "rawnumberp" $ do
|
||||||
leadingDigits <- some digitChar
|
rawNumber <- leadingDecimalPt <|> leadingDigits
|
||||||
option (Nothing, [leadingDigits]) . try $ do
|
|
||||||
firstSep <- oneOf decimalPointChars <|> whitespaceChar
|
|
||||||
secondGroup <- some digitChar
|
|
||||||
otherGroups <- many $ try $ char firstSep *> some digitChar
|
|
||||||
return (Just firstSep, leadingDigits : secondGroup : otherGroups)
|
|
||||||
|
|
||||||
let remSepChars = maybe decimalPointChars (`delete` decimalPointChars) firstSep
|
-- Guard against mistyped numbers
|
||||||
modifier
|
notFollowedBy $ satisfy isDecimalPointChar <|> (char ' ' >> digitChar)
|
||||||
| null groups = fmap Just -- if no digits so far, we require at least some decimals
|
|
||||||
| otherwise = optional
|
|
||||||
|
|
||||||
extraGroup <- modifier $ do
|
return $ dbg8 "rawnumberp" rawNumber
|
||||||
lastSep <- oneOf remSepChars
|
|
||||||
digits <- modifier $ some digitChar -- decimal separator allowed to be without digits if had some before
|
|
||||||
return (lastSep, fromMaybe [] digits)
|
|
||||||
|
|
||||||
-- make sure we didn't leading part of mistyped number
|
where
|
||||||
notFollowedBy $ oneOf decimalPointChars <|> (whitespaceChar >> digitChar)
|
|
||||||
|
|
||||||
return $ dbg8 "rawnumberp" (firstSep, groups, extraGroup)
|
leadingDecimalPt :: TextParser m RawNumber
|
||||||
<?> "rawnumberp"
|
leadingDecimalPt =
|
||||||
|
LeadingDecimalPt <$> satisfy isDecimalPointChar <*> pdigitgroup
|
||||||
|
|
||||||
decimalPointChars :: String
|
leadingDigits :: TextParser m RawNumber
|
||||||
decimalPointChars = ".,"
|
leadingDigits = do
|
||||||
|
grp1 <- pdigitgroup
|
||||||
|
withSeparators grp1 <|> trailingDecimalPt grp1 <|> pure (NoSeparators grp1)
|
||||||
|
|
||||||
-- | Parse a unicode char that represents any non-control space char (Zs general category).
|
withSeparators :: DigitGrp -> TextParser m RawNumber
|
||||||
whitespaceChar :: TextParser m Char
|
withSeparators grp1 = do
|
||||||
whitespaceChar = charCategory Space
|
(sep, grp2) <- try $ (,) <$> satisfy isDigitSeparatorChar <*> pdigitgroup
|
||||||
|
grps <- many $ try $ char sep *> pdigitgroup
|
||||||
|
|
||||||
|
let digitGroups = grp1 : grp2 : grps
|
||||||
|
withDecimalPt sep digitGroups <|> pure (withoutDecimalPt grp1 sep grp2 grps)
|
||||||
|
|
||||||
|
withDecimalPt :: Char -> [DigitGrp] -> TextParser m RawNumber
|
||||||
|
withDecimalPt digitSep digitGroups = do
|
||||||
|
decimalPt <- satisfy $ \c -> isDecimalPointChar c && c /= digitSep
|
||||||
|
decimalDigitGroup <- option mempty pdigitgroup
|
||||||
|
|
||||||
|
pure $ BothSeparators digitSep digitGroups decimalPt decimalDigitGroup
|
||||||
|
|
||||||
|
withoutDecimalPt :: DigitGrp -> Char -> DigitGrp -> [DigitGrp] -> RawNumber
|
||||||
|
withoutDecimalPt grp1 sep grp2 grps
|
||||||
|
| null grps = AmbiguousNumber grp1 sep grp2
|
||||||
|
| otherwise = DigitSeparators sep (grp1:grp2:grps)
|
||||||
|
|
||||||
|
|
||||||
|
trailingDecimalPt :: DigitGrp -> TextParser m RawNumber
|
||||||
|
trailingDecimalPt grp1 = do
|
||||||
|
decimalPt <- satisfy isDecimalPointChar
|
||||||
|
pure $ TrailingDecimalPt grp1 decimalPt
|
||||||
|
|
||||||
|
|
||||||
|
isDecimalPointChar :: Char -> Bool
|
||||||
|
isDecimalPointChar c = c == '.' || c == ','
|
||||||
|
|
||||||
|
isDigitSeparatorChar :: Char -> Bool
|
||||||
|
isDigitSeparatorChar c = isDecimalPointChar c || c == ' '
|
||||||
|
|
||||||
|
|
||||||
|
data DigitGrp = DigitGrp {
|
||||||
|
digitGroupLength :: Int,
|
||||||
|
digitGroupNumber :: Integer
|
||||||
|
} deriving (Eq)
|
||||||
|
|
||||||
|
instance Show DigitGrp where
|
||||||
|
show (DigitGrp len num)
|
||||||
|
| len > 0 = "\"" ++ padding ++ numStr ++ "\""
|
||||||
|
| otherwise = "\"\""
|
||||||
|
where numStr = show num
|
||||||
|
padding = replicate (len - length numStr) '0'
|
||||||
|
|
||||||
|
instance Sem.Semigroup DigitGrp where
|
||||||
|
DigitGrp l1 n1 <> DigitGrp l2 n2 = DigitGrp (l1 + l2) (n1 * 10^l2 + n2)
|
||||||
|
|
||||||
|
instance Monoid DigitGrp where
|
||||||
|
mempty = DigitGrp 0 0
|
||||||
|
mappend = (Sem.<>)
|
||||||
|
|
||||||
|
pdigitgroup :: TextParser m DigitGrp
|
||||||
|
pdigitgroup = label "digit group"
|
||||||
|
$ makeGroup <$> takeWhile1P (Just "digit") isDigit
|
||||||
|
where
|
||||||
|
makeGroup = uncurry DigitGrp . foldl' step (0, 0) . T.unpack
|
||||||
|
step (!l, !a) c = (l+1, a*10 + fromIntegral (digitToInt c))
|
||||||
|
|
||||||
|
|
||||||
|
data RawNumber
|
||||||
|
= LeadingDecimalPt Char DigitGrp -- .50
|
||||||
|
| TrailingDecimalPt DigitGrp Char -- 100.
|
||||||
|
| NoSeparators DigitGrp -- 100
|
||||||
|
| AmbiguousNumber DigitGrp Char DigitGrp -- 1,000
|
||||||
|
| DigitSeparators Char [DigitGrp] -- 1,000,000
|
||||||
|
| BothSeparators Char [DigitGrp] Char DigitGrp -- 1,000.50
|
||||||
|
deriving (Show, Eq)
|
||||||
|
|
||||||
-- test_numberp = do
|
-- test_numberp = do
|
||||||
-- let s `is` n = assertParseEqual (parseWithState mempty numberp s) n
|
-- let s `is` n = assertParseEqual (parseWithState mempty numberp s) n
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user