mostly replace slow regexpr with regex-tdfa (fixes #189)

This commit is contained in:
Simon Michael 2014-07-06 14:03:28 -07:00
parent 4c509ff933
commit 3a16e6cfc7
5 changed files with 105 additions and 40 deletions

View File

@ -655,7 +655,7 @@ getEffectiveAssignment rules record f = lastMay $ assignmentsFor f
matcherMatches pats = patternMatches $ "(" ++ intercalate "|" pats ++ ")" matcherMatches pats = patternMatches $ "(" ++ intercalate "|" pats ++ ")"
where where
patternMatches :: Regexp -> Bool patternMatches :: Regexp -> Bool
patternMatches pat = regexMatchesCIRegexCompat pat csvline patternMatches pat = regexMatchesCI pat csvline
where where
csvline = intercalate "," record csvline = intercalate "," record

View File

@ -81,7 +81,7 @@ detect :: FilePath -> String -> Bool
detect f s detect f s
| f /= "-" = takeExtension f `elem` ['.':format, ".j"] -- from a file: yes if the extension is .journal or .j | f /= "-" = takeExtension f `elem` ['.':format, ".j"] -- from a file: yes if the extension is .journal or .j
-- from stdin: yes if we can see something that looks like a journal entry (digits in column 0 with the next line indented) -- from stdin: yes if we can see something that looks like a journal entry (digits in column 0 with the next line indented)
| otherwise = isJust $ regexMatch "^[0-9]+.*\n[ \t]+" s | otherwise = regexMatches "^[0-9]+.*\n[ \t]+" s
-- | Parse and post-process a "Journal" from hledger's journal file -- | Parse and post-process a "Journal" from hledger's journal file
-- format, or give an error. -- format, or give an error.

View File

@ -1,11 +1,28 @@
-- Regular expression helpers. {-# LANGUAGE ScopedTypeVariables #-}
-- Currently using mostly regexpr and some regex-tdfa. {-
-- Note many of these will die on malformed regexps.
Easy regular expression helpers.
These should
- have mnemonic names
- have simple monomorphic types
- work with strings
- support extended regular expressions
- support replacement
- support splitting
- support unicode
- be cross-platform, not requiring C libraries
They currently can
- die on malformed regexps
- be slow (regexpr)
-}
module Hledger.Utils.Regex ( module Hledger.Utils.Regex (
regexMatch -- regexMatch
,regexMatchCI -- ,regexMatchCI
,regexMatches regexMatches
,regexMatchesCI ,regexMatchesCI
,containsRegex ,containsRegex
,regexReplace ,regexReplace
@ -13,47 +30,89 @@ module Hledger.Utils.Regex (
,regexReplaceBy ,regexReplaceBy
,regexToCaseInsensitive ,regexToCaseInsensitive
,regexSplit ,regexSplit
,regexMatchesRegexCompat ,toRegex
,regexMatchesCIRegexCompat
) )
where where
import Data.Maybe
import Text.Regex.TDFA
import Text.RegexPR
-- regexMatch :: String -> String -> MatchFun Maybe import Data.List (foldl')
regexMatch r s = matchRegexPR r s import Text.RegexPR (splitRegexPR)
import Text.Regex.TDFA (
Regex, CompOption(..), ExecOption(..), defaultCompOpt, defaultExecOpt,
makeRegexOpts, AllMatches(getAllMatches), match, (=~)
)
-- regexMatchCI :: String -> String -> MatchFun Maybe
regexMatchCI r s = regexMatch (regexToCaseInsensitive r) s
regexMatches :: String -> String -> Bool type Regexp = String
regexMatches r s = isJust $ matchRegexPR r s type Replacement = String
regexMatchesCI :: String -> String -> Bool
regexMatchesCI r s = regexMatches (regexToCaseInsensitive r) s
containsRegex :: Regexp -> String -> Bool
containsRegex = regexMatchesCI containsRegex = regexMatchesCI
regexReplace :: String -> String -> String -> String regexToCaseInsensitive :: Regexp -> Regexp
regexReplace r repl s = gsubRegexPR r repl s
regexReplaceCI :: String -> String -> String -> String
regexReplaceCI r s = regexReplace (regexToCaseInsensitive r) s
regexReplaceBy :: String -> (String -> String) -> String -> String
regexReplaceBy r replfn s = gsubRegexPRBy r replfn s
regexToCaseInsensitive :: String -> String
regexToCaseInsensitive r = "(?i)"++ r regexToCaseInsensitive r = "(?i)"++ r
regexSplit :: String -> String -> [String] -- regexpr - may be slow
regexSplit :: Regexp -> String -> [Regexp]
regexSplit = splitRegexPR regexSplit = splitRegexPR
-- regex-compat (regex-posix) functions that perform better than regexpr. -- regexMatch :: Regexp -> String -> MatchFun Maybe
regexMatchesRegexCompat :: String -> String -> Bool -- regexMatch r s = matchRegexPR r s
regexMatchesRegexCompat = flip (=~)
regexMatchesCIRegexCompat :: String -> String -> Bool -- regexMatchCI :: Regexp -> String -> MatchFun Maybe
regexMatchesCIRegexCompat r = match (makeRegexOpts defaultCompOpt { multiline = True, caseSensitive = False, newSyntax = True } defaultExecOpt r) -- regexMatchCI r s = regexMatch (regexToCaseInsensitive r) s
-- regexMatches :: Regexp -> String -> Bool
-- regexMatches r s = isJust $ matchRegexPR r s
-- regexMatchesCI :: Regexp -> String -> Bool
-- regexMatchesCI r s = regexMatches (regexToCaseInsensitive r) s
-- regexReplace :: Regexp -> Replacement -> String -> String
-- regexReplace r repl s = gsubRegexPR r repl s
-- regexReplaceCI :: Regexp -> Replacement -> String -> String
-- regexReplaceCI r s = regexReplace (regexToCaseInsensitive r) s
-- regexReplaceBy :: Regexp -> (String -> Replacement) -> String -> String
-- regexReplaceBy r replfn s = gsubRegexPRBy r replfn s
-- regex-tdfa
compOpt :: CompOption
compOpt = defaultCompOpt
execOpt :: ExecOption
execOpt = defaultExecOpt
toRegex :: Regexp -> Regex
toRegex = makeRegexOpts compOpt execOpt
-- regexMatch' :: RegexContext Regexp String a => Regexp -> String -> a
-- regexMatch' r s = s =~ (toRegex r)
regexMatches :: Regexp -> String -> Bool
regexMatches = flip (=~)
regexMatchesCI :: Regexp -> String -> Bool
regexMatchesCI r = match (makeRegexOpts compOpt{caseSensitive=False} execOpt r)
regexReplace :: Regexp -> Replacement -> String -> String
regexReplace r repl = regexReplaceBy r (const repl)
regexReplaceCI :: Regexp -> Replacement -> String -> String
regexReplaceCI r s = regexReplace (regexToCaseInsensitive r) s
regexReplaceBy :: Regexp -> (String -> Replacement) -> String -> String
regexReplaceBy r = replaceAll (toRegex r)
-- from http://stackoverflow.com/questions/9071682/replacement-substition-with-haskell-regex-libraries
replaceAll :: Regex -> (String -> Replacement) -> String -> String
replaceAll re f s = start end
where
(_, end, start) = foldl' go (0, s, id) $ getAllMatches $ match re s
go (ind,read,write) (off,len) =
let (skip, start) = splitAt (off - ind) read
(matched, remaining) = splitAt len start
in (off + len, remaining, write . (skip++) . (f matched ++))

View File

@ -34,6 +34,8 @@ import System.Process (readProcessWithExitCode)
import System.Time (ClockTime, getClockTime, diffClockTimes, TimeDiff(TimeDiff)) import System.Time (ClockTime, getClockTime, diffClockTimes, TimeDiff(TimeDiff))
import Test.HUnit import Test.HUnit
import Text.Printf import Text.Printf
import Text.Regex.TDFA (match)
-- kludge - adapt to whichever directory version is installed, or when -- kludge - adapt to whichever directory version is installed, or when
-- cabal macros aren't available, assume the new directory -- cabal macros aren't available, assume the new directory
@ -179,7 +181,8 @@ safeGetDirectoryContents "" = getDirectoryContents "."
safeGetDirectoryContents fp = getDirectoryContents fp safeGetDirectoryContents fp = getDirectoryContents fp
-- | Does the second file represent a backup of the first, and if so which version is it ? -- | Does the second file represent a backup of the first, and if so which version is it ?
-- XXX nasty regex types intruding, add a simpler api to Hledger.Utils.Regex
backupNumber :: FilePath -> FilePath -> Maybe Int backupNumber :: FilePath -> FilePath -> Maybe Int
backupNumber f g = case regexMatch ("^" ++ f ++ "\\.([0-9]+)$") g of backupNumber f g = case match (toRegex ("^" ++ f ++ "\\.([0-9]+)$")) g of
Just (_, ((_,suffix):_)) -> readMay suffix (_::FilePath, _::FilePath, _::FilePath, [ext::FilePath]) -> readMay ext
_ -> Nothing _ -> Nothing

View File

@ -80,6 +80,7 @@ library
,old-time ,old-time
,parsec ,parsec
,process ,process
,regex-tdfa
,regexpr >= 0.5.1 ,regexpr >= 0.5.1
,safe >= 0.2 ,safe >= 0.2
-- http://www.yesodweb.com/blog/2014/04/consolidation-progress -- http://www.yesodweb.com/blog/2014/04/consolidation-progress
@ -149,6 +150,7 @@ executable hledger
,old-time ,old-time
,parsec ,parsec
,process ,process
,regex-tdfa
,regexpr >= 0.5.1 ,regexpr >= 0.5.1
,safe >= 0.2 ,safe >= 0.2
,shakespeare-text >= 1.0 && < 1.2 ,shakespeare-text >= 1.0 && < 1.2
@ -183,6 +185,7 @@ test-suite tests
, old-time , old-time
, parsec , parsec
, process , process
, regex-tdfa
, regexpr , regexpr
, safe , safe
, shakespeare-text >= 1.0 && < 1.2 , shakespeare-text >= 1.0 && < 1.2