diff --git a/hledger-lib/Hledger/Utils/Regex.hs b/hledger-lib/Hledger/Utils/Regex.hs index a319739d3..5426252a9 100644 --- a/hledger-lib/Hledger/Utils/Regex.hs +++ b/hledger-lib/Hledger/Utils/Regex.hs @@ -1,7 +1,18 @@ {-# LANGUAGE ScopedTypeVariables #-} {-| -Easy regular expression helpers, based on regex-tdfa and regexpr. These should +Easy regular expression helpers, based on regex-tdfa and (a little) on +regexpr. These should: + +- be cross-platform, not requiring C libraries + +- support unicode + +- support extended regular expressions + +- support replacement, with backreferences etc. + +- support splitting - have mnemonic names @@ -9,82 +20,55 @@ Easy regular expression helpers, based on regex-tdfa and regexpr. These should - work with strings -- support extended regular expressions +Current limitations: -- support replacement - -- support splitting - -- support unicode - -- be cross-platform, not requiring C libraries +- (?i) and similar are not supported -} module Hledger.Utils.Regex ( + -- * type aliases Regexp ,Replacement - -- ,regexMatch - -- ,regexMatchCI + -- * based on regex-tdfa ,regexMatches ,regexMatchesCI - ,containsRegex ,regexReplace ,regexReplaceCI ,regexReplaceBy - -- ,regexpToCI + ,regexReplaceByCI + -- * based on regexpr ,regexSplit - ,toRegex ) where +import Data.Array +import Data.Char import Data.List (foldl') import Text.RegexPR (splitRegexPR) import Text.Regex.TDFA ( Regex, CompOption(..), ExecOption(..), defaultCompOpt, defaultExecOpt, - makeRegexOpts, AllMatches(getAllMatches), match, (=~) + makeRegexOpts, AllMatches(getAllMatches), match, (=~), MatchText ) +-- import Hledger.Utils.Debug +import Hledger.Utils.UTF8IOCompat (error') + +-- | Regular expression. Extended regular expression-ish syntax ? But does not support eg (?i) syntax. type Regexp = String + +-- | A replacement pattern. May include backreferences (\N). type Replacement = String -containsRegex :: Regexp -> String -> Bool -containsRegex = regexMatchesCI - --- regexpr - may be slow - -regexSplit :: Regexp -> String -> [Regexp] -regexSplit = splitRegexPR - --- regexMatch :: Regexp -> String -> MatchFun Maybe --- regexMatch r s = matchRegexPR r s - --- regexMatchCI :: Regexp -> String -> MatchFun Maybe --- regexMatchCI r s = regexMatch (regexpToCI r) s - --- regexMatches :: Regexp -> String -> Bool --- regexMatches r s = isJust $ matchRegexPR r s - --- regexMatchesCI :: Regexp -> String -> Bool --- regexMatchesCI r s = regexMatches (regexpToCI r) s - --- regexReplace :: Regexp -> Replacement -> String -> String --- regexReplace r repl s = gsubRegexPR r repl s - --- regexReplaceCI :: Regexp -> Replacement -> String -> String --- regexReplaceCI r s = regexReplace (regexpToCI r) s - --- regexReplaceBy :: Regexp -> (String -> Replacement) -> String -> String --- regexReplaceBy r replfn s = gsubRegexPRBy r replfn s - --- regex-tdfa - -- | Convert our string-based regexps to real ones. Can fail if the -- string regexp is malformed. toRegex :: Regexp -> Regex toRegex = makeRegexOpts compOpt execOpt +toRegexCI :: Regexp -> Regex +toRegexCI = makeRegexOpts compOpt{caseSensitive=False} execOpt + compOpt :: CompOption compOpt = defaultCompOpt @@ -100,31 +84,48 @@ regexMatches = flip (=~) regexMatchesCI :: Regexp -> String -> Bool regexMatchesCI r = match (toRegexCI r) +-- | Replace all occurrences of the regexp, transforming each match with the given function. +regexReplaceBy :: Regexp -> (String -> String) -> String -> String +regexReplaceBy r = replaceAllBy (toRegex r) + +regexReplaceByCI :: Regexp -> (String -> String) -> String -> String +regexReplaceByCI r = replaceAllBy (toRegexCI r) + +-- | Replace all occurrences of the regexp with the replacement pattern. +-- The replacement pattern supports \N backreferences but no other RE syntax. regexReplace :: Regexp -> Replacement -> String -> String -regexReplace r repl = regexReplaceBy r (const repl) +regexReplace re = replaceRegex (toRegex re) regexReplaceCI :: Regexp -> Replacement -> String -> String -regexReplaceCI r repl = regexReplaceByCI r (const repl) +regexReplaceCI re = replaceRegex (toRegexCI re) -regexReplaceBy :: Regexp -> (String -> Replacement) -> String -> String -regexReplaceBy r = replaceAll (toRegex r) +-- -regexReplaceByCI :: Regexp -> (String -> Replacement) -> String -> String -regexReplaceByCI r = replaceAll (toRegexCI r) +replaceRegex :: Regex -> Replacement -> String -> String +replaceRegex re repl s = foldl (replaceMatch repl) s (reverse $ match re s :: [MatchText String]) -toRegexCI :: Regexp -> Regex -toRegexCI = makeRegexOpts compOpt{caseSensitive=False} execOpt +replaceMatch :: Replacement -> String -> MatchText String -> String +replaceMatch replpat s matchgroups = pre ++ repl ++ post + where + ((_,(off,len)):_) = elems matchgroups -- groups should have 0-based indexes, and there should always be at least one, since this is a match + (pre, post') = splitAt off s + post = drop len post' + repl = replaceAllBy (toRegex "\\\\[0-9]+") (replaceBackReference matchgroups) replpat --- regexpToCI :: Regexp -> Regexp --- regexpToCI r = "(?i)"++ r +replaceBackReference :: MatchText String -> String -> String +replaceBackReference grps ('\\':s@(_:_)) | all isDigit s = + case read s of n | n `elem` indices grps -> fst (grps ! n) + _ -> error' $ "no match group exists for backreference \"\\"++s++"\"" +replaceBackReference _ s = error' $ "replaceBackReference called on non-backreference \""++s++"\", shouldn't happen" --- from --- http://stackoverflow.com/questions/9071682/replacement-substition-with-haskell-regex-libraries +-- + +-- http://stackoverflow.com/questions/9071682/replacement-substition-with-haskell-regex-libraries : -- | Replace all occurrences of a regexp in a string using a replacer -- function, which receives the matched string as its argument. --- Does not support standard RE syntax such as \1. -replaceAll :: Regex -> (String -> Replacement) -> String -> String -replaceAll re f s = start end +-- Does not support backreferences or other RE syntax. +replaceAllBy :: Regex -> (String -> String) -> String -> String +replaceAllBy re f s = start end where (_, end, start) = foldl' go (0, s, id) $ getAllMatches $ match re s go (ind,read,write) (off,len) = @@ -132,3 +133,8 @@ replaceAll re f s = start end (matched, remaining) = splitAt len start in (off + len, remaining, write . (skip++) . (f matched ++)) +-- uses regexpr, may be slow: + +regexSplit :: Regexp -> String -> [Regexp] +regexSplit = splitRegexPR + diff --git a/hledger-lib/hledger-lib.cabal b/hledger-lib/hledger-lib.cabal index 5b1571257..bba7d59af 100644 --- a/hledger-lib/hledger-lib.cabal +++ b/hledger-lib/hledger-lib.cabal @@ -82,6 +82,7 @@ library Hledger.Utils.UTF8IOCompat build-depends: base >= 4.3 && < 5 + ,array ,blaze-markup >= 0.5.1 ,bytestring ,cmdargs >= 0.10 && < 0.11 @@ -116,6 +117,7 @@ test-suite tests default-language: Haskell2010 build-depends: hledger-lib , base >= 4.3 && < 5 + , array , blaze-markup >= 0.5.1 , cmdargs , containers diff --git a/hledger/Hledger/Cli/Utils.hs b/hledger/Hledger/Cli/Utils.hs index 507efd937..735050e9d 100644 --- a/hledger/Hledger/Cli/Utils.hs +++ b/hledger/Hledger/Cli/Utils.hs @@ -35,7 +35,7 @@ import System.Process (readProcessWithExitCode) import System.Time (ClockTime, getClockTime, diffClockTimes, TimeDiff(TimeDiff)) import Test.HUnit import Text.Printf -import Text.Regex.TDFA (match) +import Text.Regex.TDFA ((=~)) -- kludge - adapt to whichever directory version is installed, or when @@ -190,6 +190,6 @@ safeGetDirectoryContents fp = getDirectoryContents fp -- | Does the second file represent a backup of the first, and if so which version is it ? -- XXX nasty regex types intruding, add a simpler api to Hledger.Utils.Regex backupNumber :: FilePath -> FilePath -> Maybe Int -backupNumber f g = case match (toRegex ("^" ++ f ++ "\\.([0-9]+)$")) g of +backupNumber f g = case g =~ ("^" ++ f ++ "\\.([0-9]+)$") of (_::FilePath, _::FilePath, _::FilePath, [ext::FilePath]) -> readMay ext _ -> Nothing diff --git a/tests/misc/aliases.test b/tests/misc/aliases.test index 9e9c732ae..50d7db5c1 100644 --- a/tests/misc/aliases.test +++ b/tests/misc/aliases.test @@ -29,9 +29,9 @@ alias A (.)=\1 c -3 2011/01/01 - \1 1 - \1 2 - c -3 + a 1 + a 2 + c -3 >>>=0