lib: memoize "if blocks that assign filed f" for 50% CSV reader speedup
This commit is contained in:
		
							parent
							
								
									834e9ec104
								
							
						
					
					
						commit
						c397b90b5b
					
				| @ -53,6 +53,7 @@ import Data.Bifunctor             (first) | ||||
| import "base-compat-batteries" Data.List.Compat | ||||
| import qualified Data.List.Split as LS (splitOn) | ||||
| import Data.Maybe | ||||
| import Data.MemoUgly (memo) | ||||
| import Data.Ord | ||||
| import qualified Data.Set as S | ||||
| import Data.Text (Text) | ||||
| @ -164,29 +165,29 @@ defaultRulesText csvfile = T.pack $ unlines | ||||
|   ," account2 assets:bank:savings\n" | ||||
|   ] | ||||
| 
 | ||||
| addDirective :: (DirectiveName, String) -> CsvRules -> CsvRules | ||||
| addDirective :: (DirectiveName, String) -> CsvRulesParsed -> CsvRulesParsed | ||||
| addDirective d r = r{rdirectives=d:rdirectives r} | ||||
| 
 | ||||
| addAssignment :: (HledgerFieldName, FieldTemplate) -> CsvRules -> CsvRules | ||||
| addAssignment :: (HledgerFieldName, FieldTemplate) -> CsvRulesParsed -> CsvRulesParsed | ||||
| addAssignment a r = r{rassignments=a:rassignments r} | ||||
| 
 | ||||
| setIndexesAndAssignmentsFromList :: [CsvFieldName] -> CsvRules -> CsvRules | ||||
| setIndexesAndAssignmentsFromList :: [CsvFieldName] -> CsvRulesParsed -> CsvRulesParsed | ||||
| setIndexesAndAssignmentsFromList fs r = addAssignmentsFromList fs . setCsvFieldIndexesFromList fs $ r | ||||
| 
 | ||||
| setCsvFieldIndexesFromList :: [CsvFieldName] -> CsvRules -> CsvRules | ||||
| setCsvFieldIndexesFromList :: [CsvFieldName] -> CsvRulesParsed -> CsvRulesParsed | ||||
| setCsvFieldIndexesFromList fs r = r{rcsvfieldindexes=zip fs [1..]} | ||||
| 
 | ||||
| addAssignmentsFromList :: [CsvFieldName] -> CsvRules -> CsvRules | ||||
| addAssignmentsFromList :: [CsvFieldName] -> CsvRulesParsed -> CsvRulesParsed | ||||
| addAssignmentsFromList fs r = foldl' maybeAddAssignment r journalfieldnames | ||||
|   where | ||||
|     maybeAddAssignment rules f = (maybe id addAssignmentFromIndex $ elemIndex f fs) rules | ||||
|       where | ||||
|         addAssignmentFromIndex i = addAssignment (f, "%"++show (i+1)) | ||||
| 
 | ||||
| addConditionalBlock :: ConditionalBlock -> CsvRules -> CsvRules | ||||
| addConditionalBlock :: ConditionalBlock -> CsvRulesParsed -> CsvRulesParsed | ||||
| addConditionalBlock b r = r{rconditionalblocks=b:rconditionalblocks r} | ||||
| 
 | ||||
| addConditionalBlocks :: [ConditionalBlock] -> CsvRules -> CsvRules | ||||
| addConditionalBlocks :: [ConditionalBlock] -> CsvRulesParsed -> CsvRulesParsed | ||||
| addConditionalBlocks bs r = r{rconditionalblocks=bs++rconditionalblocks r} | ||||
| 
 | ||||
| getDirective :: DirectiveName -> CsvRules -> Maybe FieldTemplate | ||||
| @ -239,18 +240,42 @@ validateRules rules = do | ||||
| 
 | ||||
| -- | A set of data definitions and account-matching patterns sufficient to | ||||
| -- convert a particular CSV data file into meaningful journal transactions. | ||||
| data CsvRules = CsvRules { | ||||
| data CsvRules' a = CsvRules' { | ||||
|   rdirectives        :: [(DirectiveName,String)], | ||||
|     -- ^ top-level rules, as (keyword, value) pairs | ||||
|   rcsvfieldindexes   :: [(CsvFieldName, CsvFieldIndex)], | ||||
|     -- ^ csv field names and their column number, if declared by a fields list | ||||
|   rassignments       :: [(HledgerFieldName, FieldTemplate)], | ||||
|     -- ^ top-level assignments to hledger fields, as (field name, value template) pairs | ||||
|   rconditionalblocks :: [ConditionalBlock] | ||||
|   rconditionalblocks :: [ConditionalBlock], | ||||
|     -- ^ conditional blocks, which containing additional assignments/rules to apply to matched csv records | ||||
| } deriving (Show, Eq) | ||||
|   rblocksassigning :: a -- (String -> [ConditionalBlock]) | ||||
|     -- ^ all conditional blocks which can potentially assign field with a given name (memoized) | ||||
| } | ||||
| 
 | ||||
| type CsvRulesParser a = StateT CsvRules SimpleTextParser a | ||||
| -- | Type used by parsers. Directives, assignments and conditional blocks | ||||
| -- are in the reverse order compared to what is in the file and rblocksassigning is non-functional, | ||||
| -- could not be used for processing CSV records yet | ||||
| type CsvRulesParsed = CsvRules' () | ||||
| 
 | ||||
| -- | Type used after parsing is done. Directives, assignments and conditional blocks | ||||
| -- are in the same order as they were in the unput file and rblocksassigning is functional. | ||||
| -- Ready to be used for CSV record processing | ||||
| type CsvRules = CsvRules' (String -> [ConditionalBlock]) | ||||
| 
 | ||||
| instance Eq CsvRules where | ||||
|   r1 == r2 = (rdirectives r1, rcsvfieldindexes r1, rassignments r1) == | ||||
|              (rdirectives r2, rcsvfieldindexes r2, rassignments r2)  | ||||
| 
 | ||||
| -- It is used for debug output only | ||||
| instance Show CsvRules where | ||||
|   show r = "CsvRules { rdirectives=" ++ show (rdirectives r) ++ | ||||
|            ", rcsvfieldindexes=" ++ show (rcsvfieldindexes r) ++ | ||||
|            ", rassignments=" ++ show (rassignments r) ++ | ||||
|            ", rconditionalblocks="++ show (rconditionalblocks r) ++ | ||||
|            " }" | ||||
| 
 | ||||
| type CsvRulesParser a = StateT CsvRulesParsed SimpleTextParser a | ||||
| 
 | ||||
| -- | The keyword of a CSV rule - "fields", "skip", "if", etc. | ||||
| type DirectiveName    = String | ||||
| @ -296,13 +321,27 @@ data ConditionalBlock = CB { | ||||
|   ,cbAssignments :: [(HledgerFieldName, FieldTemplate)] | ||||
|   } deriving (Show, Eq) | ||||
| 
 | ||||
| defrules = CsvRules { | ||||
| defrules :: CsvRulesParsed | ||||
| defrules = CsvRules' { | ||||
|   rdirectives=[], | ||||
|   rcsvfieldindexes=[], | ||||
|   rassignments=[], | ||||
|   rconditionalblocks=[] | ||||
| } | ||||
|   rconditionalblocks=[], | ||||
|   rblocksassigning = () | ||||
|   } | ||||
| 
 | ||||
| -- | Create CsvRules from the content parsed out of the rules file | ||||
| mkrules :: CsvRulesParsed -> CsvRules | ||||
| mkrules rules = | ||||
|   let conditionalblocks = reverse $ rconditionalblocks rules in | ||||
|     CsvRules' { | ||||
|     rdirectives=reverse $ rdirectives rules, | ||||
|     rcsvfieldindexes=rcsvfieldindexes rules, | ||||
|     rassignments=reverse $ rassignments rules, | ||||
|     rconditionalblocks=conditionalblocks, | ||||
|     rblocksassigning = memo (\f -> filter (any ((==f).fst) . cbAssignments) conditionalblocks) | ||||
|     } | ||||
|    | ||||
| --- *** rules parsers | ||||
| 
 | ||||
| {- | ||||
| @ -382,10 +421,7 @@ rulesp = do | ||||
|     ] | ||||
|   eof | ||||
|   r <- get | ||||
|   return r{rdirectives=reverse $ rdirectives r | ||||
|           ,rassignments=reverse $ rassignments r | ||||
|           ,rconditionalblocks=reverse $ rconditionalblocks r | ||||
|           } | ||||
|   return $ mkrules r | ||||
| 
 | ||||
| blankorcommentlinep :: CsvRulesParser () | ||||
| blankorcommentlinep = lift (dbgparse 8 "trying blankorcommentlinep") >> choiceInState [blanklinep, commentlinep] | ||||
| @ -1105,10 +1141,8 @@ getEffectiveAssignment rules record f = lastMay $ map snd $ assignments | ||||
|         -- all top level field assignments | ||||
|         toplevelassignments    = rassignments rules | ||||
|         -- all field assignments in conditional blocks assigning to field f and active for the current csv record | ||||
|         conditionalassignments = concatMap cbAssignments $ filter isBlockActive $ blocksAssigning f | ||||
|         conditionalassignments = concatMap cbAssignments $ filter isBlockActive $ rblocksassigning rules f | ||||
|           where | ||||
|             -- all conditional blocks which can potentially assign field f | ||||
|             blocksAssigning f = filter (any ((==f).fst) . cbAssignments) $ rconditionalblocks rules | ||||
|             -- does this conditional block match the current csv record ? | ||||
|             isBlockActive :: ConditionalBlock -> Bool | ||||
|             isBlockActive CB{..} = any matcherMatches cbMatchers | ||||
| @ -1182,21 +1216,21 @@ parseDateWithCustomOrDefaultFormats mformat s = asum $ map parsewith formats | ||||
| tests_CsvReader = tests "CsvReader" [ | ||||
|    tests "parseCsvRules" [ | ||||
|      test "empty file" $ | ||||
|       parseCsvRules "unknown" "" @?= Right defrules | ||||
|       parseCsvRules "unknown" "" @?= Right (mkrules defrules) | ||||
|    ] | ||||
|   ,tests "rulesp" [ | ||||
|      test "trailing comments" $ | ||||
|       parseWithState' defrules rulesp "skip\n# \n#\n" @?= Right defrules{rdirectives = [("skip","")]} | ||||
|       parseWithState' defrules rulesp "skip\n# \n#\n" @?= Right (mkrules $ defrules{rdirectives = [("skip","")]}) | ||||
| 
 | ||||
|     ,test "trailing blank lines" $ | ||||
|       parseWithState' defrules rulesp "skip\n\n  \n" @?= (Right defrules{rdirectives = [("skip","")]}) | ||||
|       parseWithState' defrules rulesp "skip\n\n  \n" @?= (Right (mkrules $ defrules{rdirectives = [("skip","")]})) | ||||
| 
 | ||||
|     ,test "no final newline" $ | ||||
|       parseWithState' defrules rulesp "skip" @?= (Right defrules{rdirectives=[("skip","")]}) | ||||
|       parseWithState' defrules rulesp "skip" @?= (Right (mkrules $ defrules{rdirectives=[("skip","")]})) | ||||
| 
 | ||||
|     ,test "assignment with empty value" $ | ||||
|       parseWithState' defrules rulesp "account1 \nif foo\n  account2 foo\n" @?= | ||||
|         (Right defrules{rassignments = [("account1","")], rconditionalblocks = [CB{cbMatchers=[RecordMatcher "foo"],cbAssignments=[("account2","foo")]}]}) | ||||
|         (Right (mkrules $ defrules{rassignments = [("account1","")], rconditionalblocks = [CB{cbMatchers=[RecordMatcher "foo"],cbAssignments=[("account2","foo")]}]})) | ||||
|    ] | ||||
|   ,tests "conditionalblockp" [ | ||||
|     test "space after conditional" $ -- #1120 | ||||
| @ -1226,11 +1260,11 @@ tests_CsvReader = tests "CsvReader" [ | ||||
|    ] | ||||
| 
 | ||||
|   ,tests "getEffectiveAssignment" [ | ||||
|     let rules = defrules{rcsvfieldindexes=[("csvdate",1)],rassignments=[("date","%csvdate")]} | ||||
|     let rules = mkrules $ defrules {rcsvfieldindexes=[("csvdate",1)],rassignments=[("date","%csvdate")]} | ||||
|      | ||||
|     in test "toplevel" $ getEffectiveAssignment rules ["a","b"] "date" @?= (Just "%csvdate") | ||||
| 
 | ||||
|    ,let rules = defrules{rcsvfieldindexes=[("csvdate",1)], rconditionalblocks=[CB [FieldMatcher "%csvdate" "a"] [("date","%csvdate")]]} | ||||
|    ,let rules = mkrules $ defrules{rcsvfieldindexes=[("csvdate",1)], rconditionalblocks=[CB [FieldMatcher "%csvdate" "a"] [("date","%csvdate")]]} | ||||
|     in test "conditional" $ getEffectiveAssignment rules ["a","b"] "date" @?= (Just "%csvdate") | ||||
|         | ||||
|    ] | ||||
|  | ||||
| @ -833,6 +833,31 @@ expecting end of input, field assignment, or newline | ||||
| ) | ||||
| >=1 | ||||
| 
 | ||||
| # 42. Rules override each other in the order listed in the file | ||||
| < | ||||
| 10/2009/09,Flubber Co,50 | ||||
| 
 | ||||
| RULES | ||||
| fields date, description, amount | ||||
| date-format %d/%Y/%m | ||||
| currency $ | ||||
| account1 assets:myacct | ||||
| if Flubber | ||||
|   account2 foo | ||||
|   comment bar | ||||
| 
 | ||||
| if 10/2009/09.*Flubber | ||||
|   account2 acct | ||||
|   comment cmt | ||||
| 
 | ||||
| $  ./csvtest.sh | ||||
| 2009-09-10 Flubber Co  ; cmt | ||||
|     assets:myacct             $50 | ||||
|     acct                     $-50 | ||||
| 
 | ||||
| >=0 | ||||
| 
 | ||||
| 
 | ||||
| ## .  | ||||
| #< | ||||
| #$  ./csvtest.sh | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user