The regex helpers were converting strings to regex-tdfa regular expressions on the fly every time, but this appears to be quite expensive. The simplest memoisation lib seems to solve it nicely.
		
			
				
	
	
		
			137 lines
		
	
	
		
			4.3 KiB
		
	
	
	
		
			Haskell
		
	
	
	
	
	
			
		
		
	
	
			137 lines
		
	
	
		
			4.3 KiB
		
	
	
	
		
			Haskell
		
	
	
	
	
	
| {-# LANGUAGE ScopedTypeVariables #-}
 | |
| {-|
 | |
| 
 | |
| Easy regular expression helpers, currently based on regex-tdfa. These should:
 | |
| 
 | |
| - be cross-platform, not requiring C libraries
 | |
| 
 | |
| - support unicode
 | |
| 
 | |
| - support extended regular expressions
 | |
| 
 | |
| - support replacement, with backreferences etc.
 | |
| 
 | |
| - support splitting
 | |
| 
 | |
| - have mnemonic names
 | |
| 
 | |
| - have simple monomorphic types
 | |
| 
 | |
| - work with simple strings
 | |
| 
 | |
| Regex strings are automatically compiled into regular expressions the
 | |
| first time they are seen, and these are cached. If you use a huge
 | |
| number of unique regular expressions this might lead to increased
 | |
| memory usage.
 | |
| 
 | |
| Current limitations:
 | |
| 
 | |
| - (?i) and similar are not supported
 | |
| 
 | |
| -}
 | |
| 
 | |
| module Hledger.Utils.Regex (
 | |
|    -- * type aliases
 | |
|    Regexp
 | |
|   ,Replacement
 | |
|    -- * standard regex operations
 | |
|   ,regexMatches
 | |
|   ,regexMatchesCI
 | |
|   ,regexReplace
 | |
|   ,regexReplaceCI
 | |
|   ,regexReplaceBy
 | |
|   ,regexReplaceByCI
 | |
|   )
 | |
| where
 | |
| 
 | |
| import Data.Array
 | |
| import Data.Char
 | |
| import Data.List (foldl')
 | |
| import Data.MemoUgly (memo)
 | |
| import Text.Regex.TDFA (
 | |
|   Regex, CompOption(..), ExecOption(..), defaultCompOpt, defaultExecOpt,
 | |
|   makeRegexOpts, AllMatches(getAllMatches), match, (=~), MatchText
 | |
|   )
 | |
| 
 | |
| -- import Hledger.Utils.Debug
 | |
| import Hledger.Utils.UTF8IOCompat (error')
 | |
| 
 | |
| 
 | |
| -- | Regular expression. Extended regular expression-ish syntax ? But does not support eg (?i) syntax.
 | |
| type Regexp = String
 | |
| 
 | |
| -- | A replacement pattern. May include numeric backreferences (\N).
 | |
| type Replacement = String
 | |
| 
 | |
| -- | Convert our string-based regexps to real ones. Can fail if the
 | |
| -- string regexp is malformed.
 | |
| toRegex :: Regexp -> Regex
 | |
| toRegex = memo (makeRegexOpts compOpt execOpt)
 | |
| 
 | |
| toRegexCI :: Regexp -> Regex
 | |
| toRegexCI = memo (makeRegexOpts compOpt{caseSensitive=False} execOpt)
 | |
| 
 | |
| compOpt :: CompOption
 | |
| compOpt = defaultCompOpt
 | |
| 
 | |
| execOpt :: ExecOption
 | |
| execOpt = defaultExecOpt
 | |
| 
 | |
| -- regexMatch' :: RegexContext Regexp String a => Regexp -> String -> a
 | |
| -- regexMatch' r s = s =~ (toRegex r)
 | |
| 
 | |
| regexMatches :: Regexp -> String -> Bool
 | |
| regexMatches = flip (=~)
 | |
| 
 | |
| regexMatchesCI :: Regexp -> String -> Bool
 | |
| regexMatchesCI r = match (toRegexCI r)
 | |
| 
 | |
| -- | Replace all occurrences of the regexp, transforming each match with the given function.
 | |
| regexReplaceBy :: Regexp -> (String -> String) -> String -> String
 | |
| regexReplaceBy r = replaceAllBy (toRegex r)
 | |
| 
 | |
| regexReplaceByCI :: Regexp -> (String -> String) -> String -> String
 | |
| regexReplaceByCI r = replaceAllBy (toRegexCI r)
 | |
| 
 | |
| -- | Replace all occurrences of the regexp with the replacement
 | |
| -- pattern. The replacement pattern supports numeric backreferences
 | |
| -- (\N) but no other RE syntax.
 | |
| regexReplace :: Regexp -> Replacement -> String -> String
 | |
| regexReplace re = replaceRegex (toRegex re)
 | |
| 
 | |
| regexReplaceCI :: Regexp -> Replacement -> String -> String
 | |
| regexReplaceCI re = replaceRegex (toRegexCI re)
 | |
| 
 | |
| --
 | |
| 
 | |
| replaceRegex :: Regex -> Replacement -> String -> String
 | |
| replaceRegex re repl s = foldl (replaceMatch repl) s (reverse $ match re s :: [MatchText String])
 | |
| 
 | |
| replaceMatch :: Replacement -> String -> MatchText String -> String
 | |
| replaceMatch replpat s matchgroups = pre ++ repl ++ post
 | |
|   where
 | |
|     ((_,(off,len)):_) = elems matchgroups  -- groups should have 0-based indexes, and there should always be at least one, since this is a match
 | |
|     (pre, post') = splitAt off s
 | |
|     post = drop len post'
 | |
|     repl = replaceAllBy (toRegex "\\\\[0-9]+") (replaceBackReference matchgroups) replpat
 | |
| 
 | |
| replaceBackReference :: MatchText String -> String -> String
 | |
| replaceBackReference grps ('\\':s@(_:_)) | all isDigit s =
 | |
|   case read s of n | n `elem` indices grps -> fst (grps ! n)
 | |
|                  _                         -> error' $ "no match group exists for backreference \"\\"++s++"\""
 | |
| replaceBackReference _ s = error' $ "replaceBackReference called on non-numeric-backreference \""++s++"\", shouldn't happen"
 | |
| 
 | |
| --
 | |
| 
 | |
| -- http://stackoverflow.com/questions/9071682/replacement-substition-with-haskell-regex-libraries :
 | |
| -- | Replace all occurrences of a regexp in a string, transforming each match with the given function.
 | |
| replaceAllBy :: Regex -> (String -> String) -> String -> String
 | |
| replaceAllBy re f s = start end
 | |
|   where
 | |
|     (_, end, start) = foldl' go (0, s, id) $ (getAllMatches $ match re s :: [(Int, Int)])
 | |
|     go (ind,read,write) (off,len) =
 | |
|       let (skip, start) = splitAt (off - ind) read
 | |
|           (matched, remaining) = splitAt len start
 | |
|       in (off + len, remaining, write . (skip++) . (f matched ++))
 | |
| 
 |