hledger/hledger-lib/Hledger/Utils/Regex.hs
2020-08-31 12:04:45 +10:00

263 lines
10 KiB
Haskell

{-# LANGUAGE DeriveDataTypeable #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeSynonymInstances #-}
{-|
Easy regular expression helpers, currently based on regex-tdfa. These should:
- be cross-platform, not requiring C libraries
- support unicode
- support extended regular expressions
- support replacement, with backreferences etc.
- support splitting
- have mnemonic names
- have simple monomorphic types
- work with simple strings
Regex strings are automatically compiled into regular expressions the first
time they are seen, and these are cached. If you use a huge number of unique
regular expressions this might lead to increased memory usage. Several
functions have memoised variants (*Memo), which also trade space for time.
Currently two APIs are provided:
- The old partial one which will call error on any problem (eg with malformed
regexps). This comes from hledger's origin as a command-line tool.
- The new total one (with _ suffixes) which will return an error message. This
is better for long-running apps like hledger-web.
We are gradually replacing usage of the old API in hledger. Probably at some
point the suffixless names will be reclaimed for the new API.
Current limitations:
- (?i) and similar are not supported
-}
module Hledger.Utils.Regex (
-- * Regexp type and constructors
Regexp(reString)
,toRegex_
,toRegexCI_
,toRegex'
,toRegexCI'
-- * type aliases
,Replacement
,RegexError
-- * partial regex operations (may call error)
-- ,regexMatches
-- ,regexMatchesCI
-- ,regexReplaceCI
-- ,regexReplaceCIMemo
-- ,regexReplaceByCI
-- * total regex operations
,match
,regexReplace
,regexReplaceMemo_
-- ,replaceAllBy
-- ,regexMatches_
-- ,regexMatchesCI_
-- ,regexReplace_
-- ,regexReplaceCI_
-- ,regexReplaceMemo_
-- ,regexReplaceCIMemo_
,replaceAllBy
)
where
import Control.Arrow (first)
import Control.Monad (foldM)
import Data.Aeson (ToJSON(..), Value(String))
import Data.Array ((!), elems, indices)
import Data.Char (isDigit)
import Data.Data (Data(..), mkNoRepType)
import Data.List (foldl')
import Data.MemoUgly (memo)
import qualified Data.Text as T
import Text.Regex.TDFA (
Regex, CompOption(..), defaultCompOpt, defaultExecOpt,
makeRegexOptsM, AllMatches(getAllMatches), match, MatchText,
RegexLike(..), RegexMaker(..), RegexOptions(..), RegexContext(..)
)
import Hledger.Utils.UTF8IOCompat (error')
-- | Regular expression. Extended regular expression-ish syntax ? But does not support eg (?i) syntax.
data Regexp
= Regexp { reString :: String, reCompiled :: Regex }
| RegexpCI { reString :: String, reCompiled :: Regex }
instance Eq Regexp where
Regexp s1 _ == Regexp s2 _ = s1 == s2
RegexpCI s1 _ == RegexpCI s2 _ = s1 == s2
_ == _ = False
instance Ord Regexp where
Regexp s1 _ `compare` Regexp s2 _ = s1 `compare` s2
RegexpCI s1 _ `compare` RegexpCI s2 _ = s1 `compare` s2
Regexp _ _ `compare` RegexpCI _ _ = LT
RegexpCI _ _ `compare` Regexp _ _ = GT
instance Show Regexp where
showsPrec d (Regexp s _) = showString "Regexp " . showsPrec d s
showsPrec d (RegexpCI s _) = showString "RegexpCI " . showsPrec d s
instance Read Regexp where
readsPrec d ('R':'e':'g':'e':'x':'p':' ':xs) = map (first toRegex') $ readsPrec d xs
readsPrec d ('R':'e':'g':'e':'x':'p':'C':'I':' ':xs) = map (first toRegexCI') $ readsPrec d xs
readsPrec _ s = error' $ "read: Not a valid regex " ++ s
instance Data Regexp where
toConstr _ = error' "No toConstr for Regex"
gunfold _ _ = error' "No gunfold for Regex"
dataTypeOf _ = mkNoRepType "Hledger.Utils.Regex"
instance ToJSON Regexp where
toJSON (Regexp s _) = String . T.pack $ "Regexp " ++ s
toJSON (RegexpCI s _) = String . T.pack $ "RegexpCI " ++ s
instance RegexLike Regexp String where
matchOnce = matchOnce . reCompiled
matchAll = matchAll . reCompiled
matchCount = matchCount . reCompiled
matchTest = matchTest . reCompiled
matchAllText = matchAllText . reCompiled
matchOnceText = matchOnceText . reCompiled
instance RegexContext Regexp String String where
match = match . reCompiled
matchM = matchM . reCompiled
-- Convert a Regexp string to a compiled Regex, or return an error message.
toRegex_ :: String -> Either RegexError Regexp
toRegex_ = memo $ \s -> mkRegexErr s (Regexp s <$> makeRegexM s)
-- Like toRegex_, but make a case-insensitive Regex.
toRegexCI_ :: String -> Either RegexError Regexp
toRegexCI_ = memo $ \s -> mkRegexErr s (RegexpCI s <$> makeRegexOptsM defaultCompOpt{caseSensitive=False} defaultExecOpt s)
-- | Make a nice error message for a regexp error.
mkRegexErr :: String -> Maybe a -> Either RegexError a
mkRegexErr s = maybe (Left errmsg) Right
where errmsg = "this regular expression could not be compiled: " ++ s
-- Convert a Regexp string to a compiled Regex, throw an error
toRegex' :: String -> Regexp
toRegex' = either error' id . toRegex_
-- Like toRegex', but make a case-insensitive Regex.
toRegexCI' :: String -> Regexp
toRegexCI' = either error' id . toRegexCI_
-- | A replacement pattern. May include numeric backreferences (\N).
type Replacement = String
-- | An regular expression compilation/processing error message.
type RegexError = String
-- helpers
regexReplace :: Regexp -> Replacement -> String -> String
regexReplace re repl s = foldl (replaceMatch repl) s (reverse $ match (reCompiled re) s :: [MatchText String])
where
replaceMatch :: Replacement -> String -> MatchText String -> String
replaceMatch replpat s matchgroups = pre ++ repl ++ post
where
((_,(off,len)):_) = elems matchgroups -- groups should have 0-based indexes, and there should always be at least one, since this is a match
(pre, post') = splitAt off s
post = drop len post'
repl = replaceAllBy backrefRegex (lookupMatchGroup matchgroups) replpat
where
lookupMatchGroup :: MatchText String -> String -> String
lookupMatchGroup grps ('\\':s@(_:_)) | all isDigit s =
case read s of n | n `elem` indices grps -> fst (grps ! n)
-- PARTIAL:
_ -> error' $ "no match group exists for backreference \"\\"++s++"\""
lookupMatchGroup _ s = error' $ "lookupMatchGroup called on non-numeric-backreference \""++s++"\", shouldn't happen"
backrefRegex = toRegex' "\\\\[0-9]+" -- PARTIAL: should not error happen
--------------------------------------------------------------------------------
-- new total functions
-- | A memoising version of regexReplace_. Caches the result for each
-- search pattern, replacement pattern, target string tuple.
regexReplaceMemo_ :: Regexp -> Replacement -> String -> Either RegexError String
regexReplaceMemo_ re repl = memo (replaceRegexUnmemo_ re repl)
-- helpers:
-- Replace this regular expression with this replacement pattern in this
-- string, or return an error message.
replaceRegexUnmemo_ :: Regexp -> Replacement -> String -> Either RegexError String
replaceRegexUnmemo_ re repl s = foldM (replaceMatch_ repl) s (reverse $ match (reCompiled re) s :: [MatchText String])
where
-- Replace one match within the string with the replacement text
-- appropriate for this match. Or return an error message.
replaceMatch_ :: Replacement -> String -> MatchText String -> Either RegexError String
replaceMatch_ replpat s matchgroups =
erepl >>= \repl -> Right $ pre ++ repl ++ post
where
((_,(off,len)):_) = elems matchgroups -- groups should have 0-based indexes, and there should always be at least one, since this is a match
(pre, post') = splitAt off s
post = drop len post'
-- The replacement text: the replacement pattern with all
-- numeric backreferences replaced by the appropriate groups
-- from this match. Or an error message.
-- FIXME: Use makeRegex instead of toRegex_
erepl = replaceAllByM backrefRegex (lookupMatchGroup_ matchgroups) replpat
where
-- Given some match groups and a numeric backreference,
-- return the referenced group text, or an error message.
lookupMatchGroup_ :: MatchText String -> String -> Either RegexError String
lookupMatchGroup_ grps ('\\':s@(_:_)) | all isDigit s =
case read s of n | n `elem` indices grps -> Right $ fst (grps ! n)
_ -> Left $ "no match group exists for backreference \"\\"++s++"\""
lookupMatchGroup_ _ s = Left $ "lookupMatchGroup called on non-numeric-backreference \""++s++"\", shouldn't happen"
backrefRegex = toRegex' "\\\\[0-9]+" -- PARTIAL: should not happen
-- helpers
-- adapted from http://stackoverflow.com/questions/9071682/replacement-substition-with-haskell-regex-libraries:
-- Replace all occurrences of a regexp in a string, transforming each match
-- with the given pure function.
replaceAllBy :: Regexp -> (String -> String) -> String -> String
replaceAllBy re transform s = prependdone rest
where
(_, rest, prependdone) = foldl' go (0, s, id) matches
where
matches = getAllMatches $ match (reCompiled re) s :: [(Int, Int)] -- offset and length
go :: (Int,String,String->String) -> (Int,Int) -> (Int,String,String->String)
go (pos,todo,prepend) (off,len) =
let (prematch, matchandrest) = splitAt (off - pos) todo
(matched, rest) = splitAt len matchandrest
in (off + len, rest, prepend . (prematch++) . (transform matched ++))
-- Replace all occurrences of a regexp in a string, transforming each match
-- with the given monadic function. Eg if the monad is Either, a Left result
-- from the transform function short-circuits and is returned as the overall
-- result.
replaceAllByM :: forall m. Monad m => Regexp -> (String -> m String) -> String -> m String
replaceAllByM re transform s =
foldM go (0, s, id) matches >>= \(_, rest, prependdone) -> pure $ prependdone rest
where
matches = getAllMatches $ match (reCompiled re) s :: [(Int, Int)] -- offset and length
go :: (Int,String,String->String) -> (Int,Int) -> m (Int,String,String->String)
go (pos,todo,prepend) (off,len) =
let (prematch, matchandrest) = splitAt (off - pos) todo
(matched, rest) = splitAt len matchandrest
in transform matched >>= \matched' -> pure (off + len, rest, prepend . (prematch++) . (matched' ++))