From 9418807ddc4c8461fe6997ad6e4597943d62f995 Mon Sep 17 00:00:00 2001 From: Simon Michael Date: Fri, 11 Jul 2025 10:17:30 -0700 Subject: [PATCH 01/28] ;doc: include directive: clarify effect, glob limitations --- hledger/hledger.m4.md | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/hledger/hledger.m4.md b/hledger/hledger.m4.md index f82ff2282..fb77c5744 100644 --- a/hledger/hledger.m4.md +++ b/hledger/hledger.m4.md @@ -2509,29 +2509,33 @@ file contains digit group marks (eg thousands separators). You can pull in the content of additional files by writing an include directive, like this: ```journal -include FILEPATH +include SOMEFILE ``` -Only journal files can include, and only journal, timeclock or timedot files can be included (not CSV files, currently). +This has the same effect as if SOMEFILE's content was inlined at this point. +(Any `include`s within SOMEFILE will also be inlined, recursively.) -If the file path does not begin with a slash, it is relative to the current file's folder. +Only journal files can include other files. +They can include journal, timeclock or timedot files, but not CSV files. -A tilde means home directory, eg: `include ~/main.journal`. +If the file path begins with a tilde, that means home directory: `include ~/main.journal`. + +If it begins with a slash, it is an absolute path: `include /home/user/main.journal`. + +Otherwise it is relative to the current file's folder: `include finances/main.journal`. + +Also, the path may have a file type prefix to force a specific file format +(as described in [Data formats](#data-formats)): `include timedot:~/notes/2023*.md`. The path may contain [glob patterns] to match multiple files, eg: `include *.journal`. -There is limited support for recursive wildcards: `**/` (the slash is required) -matches 0 or more subdirectories. It's not super convenient since you have to -avoid include cycles and including directories, but this can be done, eg: -`include */**/*.journal`. - -The path may also be prefixed to force a specific file format, -overriding the file extension (as described in -[Data formats](#data-formats)): -`include timedot:~/notes/2023*.md`. +The special glob pattern `**/` matches any number of directory parts. +This is not robust; it can hang, and `**/*.journal` is rejected. +But this will work: `include */**/*.journal` (find all .journal files below the current directory). [glob patterns]: https://hackage.haskell.org/package/Glob-0.9.2/docs/System-FilePath-Glob.html#v:compile + ## `P` directive The `P` directive declares a market price, which is From 2815a1865f492015c4de186328c76eee5f7968b7 Mon Sep 17 00:00:00 2001 From: Simon Michael Date: Fri, 11 Jul 2025 10:49:19 -0700 Subject: [PATCH 02/28] dev: includedirectivep: cleanups, docs --- hledger-lib/Hledger/Read/JournalReader.hs | 76 ++++++++++++----------- 1 file changed, 41 insertions(+), 35 deletions(-) diff --git a/hledger-lib/Hledger/Read/JournalReader.hs b/hledger-lib/Hledger/Read/JournalReader.hs index aef71e90c..67fd6f0d5 100644 --- a/hledger-lib/Hledger/Read/JournalReader.hs +++ b/hledger-lib/Hledger/Read/JournalReader.hs @@ -304,32 +304,37 @@ includedirectivep = do Nothing -> paths Just fmt -> map ((show fmt++":")++) paths -- parse them inline - forM_ prefixedpaths $ parseChild parentpos + forM_ prefixedpaths $ parseIncludedFile parentpos where - getFilePaths - :: MonadIO m => Int -> SourcePos -> FilePath -> JournalParser m [FilePath] - getFilePaths parseroff parserpos fileglobpattern = do - -- Expand a ~ at the start of the glob pattern, if any. - fileglobpattern' <- lift $ expandHomePath fileglobpattern - `orRethrowIOError` (show parserpos ++ " locating " ++ fileglobpattern) - -- Compile the glob pattern. - fileglob <- case tryCompileWith compDefault{errorRecovery=False} fileglobpattern' of - Right x -> pure x - Left e -> customFailure $ parseErrorAt parseroff $ "Invalid glob pattern: " ++ e - -- Get the directory of the including file. This will be used to resolve relative paths. - let parentfilepath = sourceName parserpos - realparentfilepath <- liftIO $ canonicalizePath parentfilepath -- Follow a symlink. If the path is already absolute, the operation never fails. - let curdir = takeDirectory realparentfilepath - -- Find all matched files, in lexicographic order mimicking the output of 'ls'. - filepaths <- liftIO $ sort <$> globDir1 fileglob curdir - if (not . null) filepaths - then pure filepaths - else customFailure $ parseErrorAt parseroff $ - "No existing files match pattern: " ++ fileglobpattern - parseChild :: MonadIO m => SourcePos -> PrefixedFilePath -> ErroringJournalParser m () - parseChild parentpos prefixedpath = do + -- Find the files matched by a glob pattern, using the current parse context + -- for detecting the current directory and for error messages. + getFilePaths :: MonadIO m => Int -> SourcePos -> FilePath -> JournalParser m [FilePath] + getFilePaths parseroff parserpos fileglobpattern = do + -- Expand a ~ at the start of the glob pattern, if any. + fileglobpattern' <- lift $ expandHomePath fileglobpattern + `orRethrowIOError` (show parserpos ++ " locating " ++ fileglobpattern) + -- Compile the glob pattern. + fileglob <- case tryCompileWith compDefault{errorRecovery=False} fileglobpattern' of + Right x -> pure x + Left e -> customFailure $ parseErrorAt parseroff $ "Invalid glob pattern: " ++ e + -- Get the directory of the including file. This will be used to resolve relative paths. + let parentfilepath = sourceName parserpos + realparentfilepath <- liftIO $ canonicalizePath parentfilepath -- Follow a symlink. If the path is already absolute, the operation never fails. + let curdir = takeDirectory realparentfilepath + -- Find all matched files, in lexicographic order mimicking the output of 'ls'. + filepaths <- liftIO $ sort <$> globDir1 fileglob curdir + if (not . null) filepaths + then pure filepaths + else customFailure $ parseErrorAt parseroff $ + "No existing files match pattern: " ++ fileglobpattern + + -- Parse the given included file (and any deeper includes, recursively) + -- as if it was inlined in the current (parent) file. + -- The position in the parent file is provided for error messages. + parseIncludedFile :: MonadIO m => SourcePos -> PrefixedFilePath -> ErroringJournalParser m () + parseIncludedFile parentpos prefixedpath = do let (_mprefix,filepath) = splitReaderPrefix prefixedpath parentj <- get @@ -372,18 +377,19 @@ includedirectivep = do -- Update the parse state. put parentj' - newJournalWithParseStateFrom :: FilePath -> Journal -> Journal - newJournalWithParseStateFrom filepath j = nulljournal{ - jparsedefaultyear = jparsedefaultyear j - ,jparsedefaultcommodity = jparsedefaultcommodity j - ,jparseparentaccounts = jparseparentaccounts j - ,jparsedecimalmark = jparsedecimalmark j - ,jparsealiases = jparsealiases j - ,jdeclaredcommodities = jdeclaredcommodities j - -- ,jparsetransactioncount = jparsetransactioncount j - ,jparsetimeclockentries = jparsetimeclockentries j - ,jincludefilestack = filepath : jincludefilestack j - } + where + newJournalWithParseStateFrom :: FilePath -> Journal -> Journal + newJournalWithParseStateFrom filepath j = nulljournal{ + jparsedefaultyear = jparsedefaultyear j + ,jparsedefaultcommodity = jparsedefaultcommodity j + ,jparseparentaccounts = jparseparentaccounts j + ,jparsedecimalmark = jparsedecimalmark j + ,jparsealiases = jparsealiases j + ,jdeclaredcommodities = jdeclaredcommodities j + -- ,jparsetransactioncount = jparsetransactioncount j + ,jparsetimeclockentries = jparsetimeclockentries j + ,jincludefilestack = filepath : jincludefilestack j + } -- | Lift an IO action into the exception monad, rethrowing any IO -- error with the given message prepended. From 251af8529f803cc06490aeff9f65c6831e0422c2 Mon Sep 17 00:00:00 2001 From: Simon Michael Date: Fri, 11 Jul 2025 11:30:20 -0700 Subject: [PATCH 03/28] dev: Hledger.Utils.Parse: cleanups --- hledger-lib/Hledger/Utils/Parse.hs | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/hledger-lib/Hledger/Utils/Parse.hs b/hledger-lib/Hledger/Utils/Parse.hs index c5c0fc6d0..61fb2e6ce 100644 --- a/hledger-lib/Hledger/Utils/Parse.hs +++ b/hledger-lib/Hledger/Utils/Parse.hs @@ -454,22 +454,17 @@ customErrorBundlePretty errBundle = -- (1) it should be possible to convert any parse error into a "final" -- parse error, -- (2) it should be possible to take a parse error thrown from an include --- file and re-throw it in the parent file, and +-- file and re-throw it in the context of the parent file, and -- (3) the pretty-printing of "final" parse errors should be consistent --- with that of ordinary parse errors, but should also report a stack of --- files for errors thrown from include files. +-- with that of ordinary parse errors, but should also report the stack of +-- parent files when errors are thrown from included files. -- -- In order to pretty-print a "final" parse error (goal 3), it must be -- bundled with include filepaths and its full source text. When a "final" -- parse error is thrown from within a parser, we do not have access to --- the full source, so we must hold the parse error until it can be joined --- with its source (and include filepaths, if it was thrown from an --- include file) by the parser's caller. --- --- A parse error with include filepaths and its full source text is --- represented by the 'FinalParseErrorBundle' type, while a parse error in --- need of either include filepaths, full source text, or both is --- represented by the 'FinalParseError' type. +-- the full source, so we must hold the parse error ('FinalParseError') +-- until it can be combined with the full source (and any parent file paths) +-- by the parser's caller ('FinalParseErrorBundle'). data FinalParseError' e -- a parse error thrown as a "final" parse error From f5d3b7bd3877f8db7daacca82d0e0261a6cfda58 Mon Sep 17 00:00:00 2001 From: Simon Michael Date: Fri, 11 Jul 2025 11:39:01 -0700 Subject: [PATCH 04/28] fix:journal: include directive error messages now show the correct line They were showing the line after the include directive, confusingly. --- hledger-lib/Hledger/Read/JournalReader.hs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hledger-lib/Hledger/Read/JournalReader.hs b/hledger-lib/Hledger/Read/JournalReader.hs index 67fd6f0d5..78176942e 100644 --- a/hledger-lib/Hledger/Read/JournalReader.hs +++ b/hledger-lib/Hledger/Read/JournalReader.hs @@ -289,14 +289,14 @@ directivep = (do -- Examples: foo.j, ../foo/bar.j, timedot:/foo/2020*, *.journal includedirectivep :: MonadIO m => ErroringJournalParser m () includedirectivep = do + -- save the position + parentoff <- getOffset + parentpos <- getSourcePos -- parse string "include" lift skipNonNewlineSpaces1 prefixedglob <- rstrip . T.unpack <$> takeWhileP Nothing (`notElem` [';','\n']) lift followingcommentp - -- save the position (does sequencing wrt newline matter ? seems not) - parentoff <- getOffset - parentpos <- getSourcePos -- find file(s) let (mprefix,glb) = splitReaderPrefix prefixedglob paths <- getFilePaths parentoff parentpos glb From c8a5b8eb37faa67e10cf24c869b788dcbcc5fff9 Mon Sep 17 00:00:00 2001 From: Simon Michael Date: Fri, 11 Jul 2025 12:13:12 -0700 Subject: [PATCH 05/28] dev: includedirectivep: cleanup --- hledger-lib/Hledger/Read/JournalReader.hs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hledger-lib/Hledger/Read/JournalReader.hs b/hledger-lib/Hledger/Read/JournalReader.hs index 78176942e..2b4264996 100644 --- a/hledger-lib/Hledger/Read/JournalReader.hs +++ b/hledger-lib/Hledger/Read/JournalReader.hs @@ -290,8 +290,8 @@ directivep = (do includedirectivep :: MonadIO m => ErroringJournalParser m () includedirectivep = do -- save the position - parentoff <- getOffset - parentpos <- getSourcePos + off <- getOffset + pos <- getSourcePos -- parse string "include" lift skipNonNewlineSpaces1 @@ -299,12 +299,12 @@ includedirectivep = do lift followingcommentp -- find file(s) let (mprefix,glb) = splitReaderPrefix prefixedglob - paths <- getFilePaths parentoff parentpos glb + paths <- getFilePaths off pos glb let prefixedpaths = case mprefix of Nothing -> paths Just fmt -> map ((show fmt++":")++) paths -- parse them inline - forM_ prefixedpaths $ parseIncludedFile parentpos + forM_ prefixedpaths $ parseIncludedFile pos where From 801a7adaa4e40a089aacc9f83b169e190752eb04 Mon Sep 17 00:00:00 2001 From: Simon Michael Date: Fri, 11 Jul 2025 12:13:52 -0700 Subject: [PATCH 06/28] imp:include: better errors, eg for missing argument; more debug output --- hledger-lib/Hledger/Read/JournalReader.hs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/hledger-lib/Hledger/Read/JournalReader.hs b/hledger-lib/Hledger/Read/JournalReader.hs index 2b4264996..ad231377c 100644 --- a/hledger-lib/Hledger/Read/JournalReader.hs +++ b/hledger-lib/Hledger/Read/JournalReader.hs @@ -299,6 +299,8 @@ includedirectivep = do lift followingcommentp -- find file(s) let (mprefix,glb) = splitReaderPrefix prefixedglob + when (null $ dbg7 "glob pattern" glb) $ + customFailure $ parseErrorAt off $ "include needs a file path or glob pattern argument" paths <- getFilePaths off pos glb let prefixedpaths = case mprefix of Nothing -> paths @@ -323,12 +325,11 @@ includedirectivep = do let parentfilepath = sourceName parserpos realparentfilepath <- liftIO $ canonicalizePath parentfilepath -- Follow a symlink. If the path is already absolute, the operation never fails. let curdir = takeDirectory realparentfilepath - -- Find all matched files, in lexicographic order mimicking the output of 'ls'. - filepaths <- liftIO $ sort <$> globDir1 fileglob curdir + -- Find all matched files, in lexicographic order (the order ls would normally show them) + filepaths <- liftIO $ (dbg7 "include: matched files" . sort) <$> globDir1 fileglob curdir if (not . null) filepaths then pure filepaths - else customFailure $ parseErrorAt parseroff $ - "No existing files match pattern: " ++ fileglobpattern + else customFailure $ parseErrorAt parseroff $ "No files were matched by file pattern: " ++ fileglobpattern -- Parse the given included file (and any deeper includes, recursively) -- as if it was inlined in the current (parent) file. From b7c4dc3b5368522fe34d817fa9437bb193a594a7 Mon Sep 17 00:00:00 2001 From: Simon Michael Date: Fri, 11 Jul 2025 13:00:51 -0700 Subject: [PATCH 07/28] fix:journal: cyclic include error messages now show the correct line --- hledger-lib/Hledger/Read/JournalReader.hs | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/hledger-lib/Hledger/Read/JournalReader.hs b/hledger-lib/Hledger/Read/JournalReader.hs index ad231377c..7504b12af 100644 --- a/hledger-lib/Hledger/Read/JournalReader.hs +++ b/hledger-lib/Hledger/Read/JournalReader.hs @@ -306,7 +306,7 @@ includedirectivep = do Nothing -> paths Just fmt -> map ((show fmt++":")++) paths -- parse them inline - forM_ prefixedpaths $ parseIncludedFile pos + forM_ prefixedpaths $ parseIncludedFile off pos where @@ -334,19 +334,20 @@ includedirectivep = do -- Parse the given included file (and any deeper includes, recursively) -- as if it was inlined in the current (parent) file. -- The position in the parent file is provided for error messages. - parseIncludedFile :: MonadIO m => SourcePos -> PrefixedFilePath -> ErroringJournalParser m () - parseIncludedFile parentpos prefixedpath = do + parseIncludedFile :: MonadIO m => Int -> SourcePos -> PrefixedFilePath -> ErroringJournalParser m () + parseIncludedFile off pos prefixedpath = do let (_mprefix,filepath) = splitReaderPrefix prefixedpath + -- Throw an error if a cycle is detected parentj <- get let parentfilestack = jincludefilestack parentj when (filepath `elem` parentfilestack) $ - Fail.fail ("Cyclic include: " ++ filepath) + customFailure $ parseErrorAt off $ "Cyclic include: " ++ filepath childInput <- dbg6Msg ("parseChild: "++takeFileName filepath) $ lift $ readFilePortably filepath - `orRethrowIOError` (show parentpos ++ " reading " ++ filepath) + `orRethrowIOError` (show pos ++ " reading " ++ filepath) let initChildj = newJournalWithParseStateFrom filepath parentj -- Choose a reader/parser based on the file path prefix or file extension, From b1f416dee72c7374d4aa4be465fc96a9ea311c3d Mon Sep 17 00:00:00 2001 From: Simon Michael Date: Fri, 11 Jul 2025 13:01:46 -0700 Subject: [PATCH 08/28] dev: parseIncludedFile: doc cleanup --- hledger-lib/Hledger/Read/JournalReader.hs | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/hledger-lib/Hledger/Read/JournalReader.hs b/hledger-lib/Hledger/Read/JournalReader.hs index 7504b12af..76b01eb38 100644 --- a/hledger-lib/Hledger/Read/JournalReader.hs +++ b/hledger-lib/Hledger/Read/JournalReader.hs @@ -344,28 +344,30 @@ includedirectivep = do when (filepath `elem` parentfilestack) $ customFailure $ parseErrorAt off $ "Cyclic include: " ++ filepath + -- Read this file's content, or throw an error childInput <- dbg6Msg ("parseChild: "++takeFileName filepath) $ lift $ readFilePortably filepath `orRethrowIOError` (show pos ++ " reading " ++ filepath) let initChildj = newJournalWithParseStateFrom filepath parentj - -- Choose a reader/parser based on the file path prefix or file extension, + -- Choose a reader based on the file path prefix or file extension, -- defaulting to JournalReader. Duplicating readJournal a bit here. let r = fromMaybe reader $ findReader Nothing (Just prefixedpath) parser = rParser r dbg6IO "parseChild: trying reader" (rFormat r) - -- Parse the file (of whichever format) to a Journal, with file path and source text attached. + -- Parse the file (and its own includes, if any) to a Journal + -- with file path and source text attached. Or throw an error. updatedChildj <- journalAddFile (filepath, childInput) <$> parseIncludeFile parser initChildj filepath childInput - -- Merge this child journal into the parent journal - -- (with debug logging for troubleshooting account display order). + -- Child journal was parsed successfully; now merge it into the parent journal. + -- Debug logging is provided for troubleshooting account display order (eg). -- The parent journal is the second argument to journalConcat; this means -- its parse state is kept, and its lists are appended to child's (which -- ultimately produces the right list order, because parent's and child's - -- lists are in reverse order at this stage. Cf #1909). + -- lists are in reverse order at this stage. Cf #1909) let parentj' = dbgJournalAcctDeclOrder ("parseChild: child " <> childfilename <> " acct decls: ") updatedChildj @@ -376,7 +378,7 @@ includedirectivep = do childfilename = takeFileName filepath parentfilename = maybe "(unknown)" takeFileName $ headMay $ jincludefilestack parentj -- XXX more accurate than journalFilePath for some reason - -- Update the parse state. + -- And update the current parse state. put parentj' where From b71e001c514615bc41edde9b15c6b75c013d83d1 Mon Sep 17 00:00:00 2001 From: Simon Michael Date: Fri, 11 Jul 2025 13:02:19 -0700 Subject: [PATCH 09/28] imp: include: more robust ** patterns, and ignore dotted directories ** now ignores anything under dotted directories, ie directories whose name begins with a dot. Eg .git/, foo/.secret/, etc. Switched from Glob to filepattern lib. --- hledger-lib/Hledger/Read/JournalReader.hs | 41 ++++++++++++++++++++--- hledger-lib/hledger-lib.cabal | 3 ++ hledger-lib/package.yaml | 1 + 3 files changed, 40 insertions(+), 5 deletions(-) diff --git a/hledger-lib/Hledger/Read/JournalReader.hs b/hledger-lib/Hledger/Read/JournalReader.hs index 76b01eb38..c133e72b5 100644 --- a/hledger-lib/Hledger/Read/JournalReader.hs +++ b/hledger-lib/Hledger/Read/JournalReader.hs @@ -94,6 +94,7 @@ import Text.Megaparsec.Char import Text.Printf import System.FilePath import "Glob" System.FilePath.Glob hiding (match) +import "filepattern" System.FilePattern.Directory import Hledger.Data import Hledger.Read.Common @@ -301,7 +302,7 @@ includedirectivep = do let (mprefix,glb) = splitReaderPrefix prefixedglob when (null $ dbg7 "glob pattern" glb) $ customFailure $ parseErrorAt off $ "include needs a file path or glob pattern argument" - paths <- getFilePaths off pos glb + paths <- getFilePaths2 off pos glb let prefixedpaths = case mprefix of Nothing -> paths Just fmt -> map ((show fmt++":")++) paths @@ -310,10 +311,11 @@ includedirectivep = do where - -- Find the files matched by a glob pattern, using the current parse context - -- for detecting the current directory and for error messages. - getFilePaths :: MonadIO m => Int -> SourcePos -> FilePath -> JournalParser m [FilePath] - getFilePaths parseroff parserpos fileglobpattern = do + -- XXX keep providing via --old-glob for a bit ? + -- Find the files matched by a glob pattern, using Glob. + -- Uses the current parse context for detecting the current directory and for error messages. + _getFilePaths1 :: MonadIO m => Int -> SourcePos -> FilePath -> JournalParser m [FilePath] + _getFilePaths1 parseroff parserpos fileglobpattern = do -- Expand a ~ at the start of the glob pattern, if any. fileglobpattern' <- lift $ expandHomePath fileglobpattern `orRethrowIOError` (show parserpos ++ " locating " ++ fileglobpattern) @@ -331,6 +333,35 @@ includedirectivep = do then pure filepaths else customFailure $ parseErrorAt parseroff $ "No files were matched by file pattern: " ++ fileglobpattern + -- Find the files matched by a glob pattern, using filepattern. + -- Uses the current parse context for detecting the current directory and for error messages. + -- This one also ignores all dotted directories (anything under .git/, foo/.secret/, etc.) + getFilePaths2 :: MonadIO m => Int -> SourcePos -> FilePath -> JournalParser m [FilePath] + getFilePaths2 off pos globpattern = do + + -- expand a tilde at the start of the glob pattern, or throw an error + expandedglob <- lift $ expandHomePath globpattern + `orRethrowIOError` (show pos ++ " locating " ++ globpattern) + + -- get the directory of the including file, to resolve relative paths + let parentfilepath = sourceName pos + realparentfilepath <- liftIO $ canonicalizePath parentfilepath -- Follow a symlink. If the path is already absolute, the operation never fails. + let cwd = takeDirectory realparentfilepath + + -- find all matched files, in lexicographic order (the order ls would normally show them) + filepaths <- liftIO $ + dbg7 "include: matched files" + . map (cwd ) + -- . sort -- XXX needed ? + <$> + getDirectoryFilesIgnore cwd [expandedglob] ["**/.*/**"] + + -- throw an error if no files matched + when (null filepaths) $ + customFailure $ parseErrorAt off $ "No files were matched by file pattern: " ++ globpattern + + pure filepaths + -- Parse the given included file (and any deeper includes, recursively) -- as if it was inlined in the current (parent) file. -- The position in the parent file is provided for error messages. diff --git a/hledger-lib/hledger-lib.cabal b/hledger-lib/hledger-lib.cabal index c3bf27760..0a1328956 100644 --- a/hledger-lib/hledger-lib.cabal +++ b/hledger-lib/hledger-lib.cabal @@ -148,6 +148,7 @@ library , extra >=1.7.11 , file-embed >=0.0.10 , filepath + , filepattern >=0.1.3 , hashtables >=1.2.3.1 , lucid , megaparsec >=7.0.0 && <9.8 @@ -208,6 +209,7 @@ test-suite doctest , extra >=1.7.11 , file-embed >=0.0.10 , filepath + , filepattern >=0.1.3 , hashtables >=1.2.3.1 , lucid , megaparsec >=7.0.0 && <9.8 @@ -269,6 +271,7 @@ test-suite unittest , extra >=1.7.11 , file-embed >=0.0.10 , filepath + , filepattern >=0.1.3 , hashtables >=1.2.3.1 , hledger-lib , lucid diff --git a/hledger-lib/package.yaml b/hledger-lib/package.yaml index e58cd9041..eec388e40 100644 --- a/hledger-lib/package.yaml +++ b/hledger-lib/package.yaml @@ -94,6 +94,7 @@ dependencies: - utf8-string >=0.3.5 - extra >=1.7.11 - Glob >= 0.9 +- filepattern >=0.1.3 ghc-options: - -Wall From 536589e2c2975fb43d8cebd3bcd730e037b82223 Mon Sep 17 00:00:00 2001 From: Simon Michael Date: Fri, 11 Jul 2025 13:32:06 -0700 Subject: [PATCH 10/28] imp: include: improve cycle and read failure error messages --- hledger-lib/Hledger/Read/JournalReader.hs | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/hledger-lib/Hledger/Read/JournalReader.hs b/hledger-lib/Hledger/Read/JournalReader.hs index c133e72b5..78b526d13 100644 --- a/hledger-lib/Hledger/Read/JournalReader.hs +++ b/hledger-lib/Hledger/Read/JournalReader.hs @@ -366,27 +366,24 @@ includedirectivep = do -- as if it was inlined in the current (parent) file. -- The position in the parent file is provided for error messages. parseIncludedFile :: MonadIO m => Int -> SourcePos -> PrefixedFilePath -> ErroringJournalParser m () - parseIncludedFile off pos prefixedpath = do + parseIncludedFile off _pos prefixedpath = do let (_mprefix,filepath) = splitReaderPrefix prefixedpath -- Throw an error if a cycle is detected parentj <- get let parentfilestack = jincludefilestack parentj - when (filepath `elem` parentfilestack) $ - customFailure $ parseErrorAt off $ "Cyclic include: " ++ filepath + when (dbg7 "parseIncludedFile: reading" filepath `elem` parentfilestack) $ + customFailure $ parseErrorAt off $ "This included file forms a cycle: " ++ filepath - -- Read this file's content, or throw an error - childInput <- - dbg6Msg ("parseChild: "++takeFileName filepath) $ - lift $ readFilePortably filepath - `orRethrowIOError` (show pos ++ " reading " ++ filepath) + -- Read the file's content, or throw an error + childInput <- lift $ readFilePortably filepath `orRethrowIOError` "reading a file failed" let initChildj = newJournalWithParseStateFrom filepath parentj -- Choose a reader based on the file path prefix or file extension, -- defaulting to JournalReader. Duplicating readJournal a bit here. let r = fromMaybe reader $ findReader Nothing (Just prefixedpath) parser = rParser r - dbg6IO "parseChild: trying reader" (rFormat r) + dbg6IO "parseIncludedFile: trying reader" (rFormat r) -- Parse the file (and its own includes, if any) to a Journal -- with file path and source text attached. Or throw an error. From 08017366b5e382c03c397906a7718d4365afc0b1 Mon Sep 17 00:00:00 2001 From: Simon Michael Date: Fri, 11 Jul 2025 13:48:58 -0700 Subject: [PATCH 11/28] imp: file reading: demote some debug=6 output to level 7 --- hledger-lib/Hledger/Data/Journal.hs | 4 ++-- hledger-lib/Hledger/Read/JournalReader.hs | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/hledger-lib/Hledger/Data/Journal.hs b/hledger-lib/Hledger/Data/Journal.hs index 2d05de113..14732acef 100644 --- a/hledger-lib/Hledger/Data/Journal.hs +++ b/hledger-lib/Hledger/Data/Journal.hs @@ -308,10 +308,10 @@ journalRenumberAccountDeclarations j = j{jdeclaredaccounts=jdas'} -- it seems unneeded except perhaps for debugging -- | Debug log the ordering of a journal's account declarations --- (at debug level 5+). +-- (at debug level 7+). dbgJournalAcctDeclOrder :: String -> Journal -> Journal dbgJournalAcctDeclOrder prefix = - dbg5With ((prefix++) . showAcctDeclsSummary . jdeclaredaccounts) + dbg7With ((prefix++) . showAcctDeclsSummary . jdeclaredaccounts) where showAcctDeclsSummary :: [(AccountName,AccountDeclarationInfo)] -> String showAcctDeclsSummary adis diff --git a/hledger-lib/Hledger/Read/JournalReader.hs b/hledger-lib/Hledger/Read/JournalReader.hs index 78b526d13..725c7b1fb 100644 --- a/hledger-lib/Hledger/Read/JournalReader.hs +++ b/hledger-lib/Hledger/Read/JournalReader.hs @@ -300,7 +300,7 @@ includedirectivep = do lift followingcommentp -- find file(s) let (mprefix,glb) = splitReaderPrefix prefixedglob - when (null $ dbg7 "glob pattern" glb) $ + when (null $ dbg6 "include: glob pattern" glb) $ customFailure $ parseErrorAt off $ "include needs a file path or glob pattern argument" paths <- getFilePaths2 off pos glb let prefixedpaths = case mprefix of @@ -328,7 +328,7 @@ includedirectivep = do realparentfilepath <- liftIO $ canonicalizePath parentfilepath -- Follow a symlink. If the path is already absolute, the operation never fails. let curdir = takeDirectory realparentfilepath -- Find all matched files, in lexicographic order (the order ls would normally show them) - filepaths <- liftIO $ (dbg7 "include: matched files" . sort) <$> globDir1 fileglob curdir + filepaths <- liftIO $ (dbg6 "include: matched files" . sort) <$> globDir1 fileglob curdir if (not . null) filepaths then pure filepaths else customFailure $ parseErrorAt parseroff $ "No files were matched by file pattern: " ++ fileglobpattern @@ -350,7 +350,7 @@ includedirectivep = do -- find all matched files, in lexicographic order (the order ls would normally show them) filepaths <- liftIO $ - dbg7 "include: matched files" + dbg6 "include: matched files" . map (cwd ) -- . sort -- XXX needed ? <$> @@ -383,7 +383,7 @@ includedirectivep = do -- defaulting to JournalReader. Duplicating readJournal a bit here. let r = fromMaybe reader $ findReader Nothing (Just prefixedpath) parser = rParser r - dbg6IO "parseIncludedFile: trying reader" (rFormat r) + dbg7IO "parseIncludedFile: trying reader" (rFormat r) -- Parse the file (and its own includes, if any) to a Journal -- with file path and source text attached. Or throw an error. @@ -549,7 +549,7 @@ commoditydirectiveonelinep = do pure $ (off, amt) lift skipNonNewlineSpaces _ <- lift followingcommentp - let comm = Commodity{csymbol=acommodity, cformat=Just $ dbg6 "style from commodity directive" astyle} + let comm = Commodity{csymbol=acommodity, cformat=Just $ dbg7 "style from commodity directive" astyle} if isNothing $ asdecimalmark astyle then customFailure $ parseErrorAt off pleaseincludedecimalpoint else modify' (\j -> j{jdeclaredcommodities=M.insert acommodity comm $ jdeclaredcommodities j}) @@ -595,7 +595,7 @@ formatdirectivep expectedsym = do then if isNothing $ asdecimalmark astyle then customFailure $ parseErrorAt off pleaseincludedecimalpoint - else return $ dbg6 "style from format subdirective" astyle + else return $ dbg7 "style from format subdirective" astyle else customFailure $ parseErrorAt off $ printf "commodity directive symbol \"%s\" and format directive symbol \"%s\" should be the same" expectedsym acommodity From 0add2e90dba26f785a5bc46dfe24abeb5039d812 Mon Sep 17 00:00:00 2001 From: Simon Michael Date: Fri, 11 Jul 2025 14:00:38 -0700 Subject: [PATCH 12/28] imp: include: glob patterns always exclude the current file Eg include **/*.journal is less likely no complain --- hledger-lib/Hledger/Read/JournalReader.hs | 16 ++++++++++------ hledger/hledger.m4.md | 9 ++++++--- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/hledger-lib/Hledger/Read/JournalReader.hs b/hledger-lib/Hledger/Read/JournalReader.hs index 725c7b1fb..ee473f153 100644 --- a/hledger-lib/Hledger/Read/JournalReader.hs +++ b/hledger-lib/Hledger/Read/JournalReader.hs @@ -333,7 +333,7 @@ includedirectivep = do then pure filepaths else customFailure $ parseErrorAt parseroff $ "No files were matched by file pattern: " ++ fileglobpattern - -- Find the files matched by a glob pattern, using filepattern. + -- Find the files matched by a glob pattern, if any, using filepattern. -- Uses the current parse context for detecting the current directory and for error messages. -- This one also ignores all dotted directories (anything under .git/, foo/.secret/, etc.) getFilePaths2 :: MonadIO m => Int -> SourcePos -> FilePath -> JournalParser m [FilePath] @@ -348,19 +348,23 @@ includedirectivep = do realparentfilepath <- liftIO $ canonicalizePath parentfilepath -- Follow a symlink. If the path is already absolute, the operation never fails. let cwd = takeDirectory realparentfilepath - -- find all matched files, in lexicographic order (the order ls would normally show them) + -- Find all matched files, in lexicographic order (the order ls would normally show them). + -- (This might include the current file.) filepaths <- liftIO $ - dbg6 "include: matched files" - . map (cwd ) + map (cwd ) -- . sort -- XXX needed ? <$> getDirectoryFilesIgnore cwd [expandedglob] ["**/.*/**"] - -- throw an error if no files matched + -- Throw an error if no files (not even the current file) were matched. when (null filepaths) $ customFailure $ parseErrorAt off $ "No files were matched by file pattern: " ++ globpattern - pure filepaths + -- If the current file was matched, exclude it now. + let filepaths' = filter (/= realparentfilepath) filepaths + dbg6IO "include: matched files (excluding current file)" filepaths' + + pure filepaths' -- Parse the given included file (and any deeper includes, recursively) -- as if it was inlined in the current (parent) file. diff --git a/hledger/hledger.m4.md b/hledger/hledger.m4.md index fb77c5744..be17d1ca5 100644 --- a/hledger/hledger.m4.md +++ b/hledger/hledger.m4.md @@ -2528,10 +2528,13 @@ Also, the path may have a file type prefix to force a specific file format (as described in [Data formats](#data-formats)): `include timedot:~/notes/2023*.md`. The path may contain [glob patterns] to match multiple files, eg: `include *.journal`. +Note, the current file is always excluded from the matched paths. +(Though include cycles are still possible, and will be reported as an error.) + +The special glob pattern `**` matches any number of path components. +It's useful for searching subdirectories. +Eg to include all .journal files below the current directory: `include **/*.journal`. -The special glob pattern `**/` matches any number of directory parts. -This is not robust; it can hang, and `**/*.journal` is rejected. -But this will work: `include */**/*.journal` (find all .journal files below the current directory). [glob patterns]: https://hackage.haskell.org/package/Glob-0.9.2/docs/System-FilePath-Glob.html#v:compile From 3a03927018b2a00d13258c0388e9f32c73526aee Mon Sep 17 00:00:00 2001 From: Simon Michael Date: Fri, 11 Jul 2025 20:12:13 -0700 Subject: [PATCH 13/28] imp: include: show including file path in debug output --- hledger-lib/Hledger/Read/JournalReader.hs | 39 +++++++++++++---------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/hledger-lib/Hledger/Read/JournalReader.hs b/hledger-lib/Hledger/Read/JournalReader.hs index ee473f153..a3537d28b 100644 --- a/hledger-lib/Hledger/Read/JournalReader.hs +++ b/hledger-lib/Hledger/Read/JournalReader.hs @@ -283,6 +283,12 @@ directivep = (do ] ) "directive" +-- Get the canonical path of the file referenced by this parse position. +-- Symbolic links will be dereferenced. This probably will always succeed +-- (since the parse file's path is probably always absolute). +sourcePosFilePath :: MonadIO m => SourcePos -> m FilePath +sourcePosFilePath = liftIO . canonicalizePath . sourceName + -- | Parse an include directive, and the file(s) it refers to, possibly recursively. -- include's argument is a file path or glob pattern, optionally with a file type prefix. -- ~ at the start is expanded to the user's home directory. @@ -300,7 +306,8 @@ includedirectivep = do lift followingcommentp -- find file(s) let (mprefix,glb) = splitReaderPrefix prefixedglob - when (null $ dbg6 "include: glob pattern" glb) $ + f <- sourcePosFilePath pos + when (null $ dbg6 (f <> " include: glob pattern") glb) $ customFailure $ parseErrorAt off $ "include needs a file path or glob pattern argument" paths <- getFilePaths2 off pos glb let prefixedpaths = case mprefix of @@ -315,23 +322,22 @@ includedirectivep = do -- Find the files matched by a glob pattern, using Glob. -- Uses the current parse context for detecting the current directory and for error messages. _getFilePaths1 :: MonadIO m => Int -> SourcePos -> FilePath -> JournalParser m [FilePath] - _getFilePaths1 parseroff parserpos fileglobpattern = do + _getFilePaths1 off pos fileglobpattern = do -- Expand a ~ at the start of the glob pattern, if any. fileglobpattern' <- lift $ expandHomePath fileglobpattern - `orRethrowIOError` (show parserpos ++ " locating " ++ fileglobpattern) + `orRethrowIOError` (show pos ++ " locating " ++ fileglobpattern) -- Compile the glob pattern. fileglob <- case tryCompileWith compDefault{errorRecovery=False} fileglobpattern' of Right x -> pure x - Left e -> customFailure $ parseErrorAt parseroff $ "Invalid glob pattern: " ++ e - -- Get the directory of the including file. This will be used to resolve relative paths. - let parentfilepath = sourceName parserpos - realparentfilepath <- liftIO $ canonicalizePath parentfilepath -- Follow a symlink. If the path is already absolute, the operation never fails. - let curdir = takeDirectory realparentfilepath + Left e -> customFailure $ parseErrorAt off $ "Invalid glob pattern: " ++ e + -- Get the directory of the including file. + parentfile <- sourcePosFilePath pos + let cwd = takeDirectory parentfile -- Find all matched files, in lexicographic order (the order ls would normally show them) - filepaths <- liftIO $ (dbg6 "include: matched files" . sort) <$> globDir1 fileglob curdir + filepaths <- liftIO $ (dbg6 (parentfile <> " include: matched files") . sort) <$> globDir1 fileglob cwd if (not . null) filepaths then pure filepaths - else customFailure $ parseErrorAt parseroff $ "No files were matched by file pattern: " ++ fileglobpattern + else customFailure $ parseErrorAt off $ "No files were matched by file pattern: " ++ fileglobpattern -- Find the files matched by a glob pattern, if any, using filepattern. -- Uses the current parse context for detecting the current directory and for error messages. @@ -343,10 +349,9 @@ includedirectivep = do expandedglob <- lift $ expandHomePath globpattern `orRethrowIOError` (show pos ++ " locating " ++ globpattern) - -- get the directory of the including file, to resolve relative paths - let parentfilepath = sourceName pos - realparentfilepath <- liftIO $ canonicalizePath parentfilepath -- Follow a symlink. If the path is already absolute, the operation never fails. - let cwd = takeDirectory realparentfilepath + -- get the directory of the including file + parentfile <- sourcePosFilePath pos + let cwd = takeDirectory parentfile -- Find all matched files, in lexicographic order (the order ls would normally show them). -- (This might include the current file.) @@ -361,8 +366,8 @@ includedirectivep = do customFailure $ parseErrorAt off $ "No files were matched by file pattern: " ++ globpattern -- If the current file was matched, exclude it now. - let filepaths' = filter (/= realparentfilepath) filepaths - dbg6IO "include: matched files (excluding current file)" filepaths' + let filepaths' = filter (/= parentfile) filepaths + dbg6IO (parentfile <> " include: matched files (excluding current file)") filepaths' pure filepaths' @@ -380,7 +385,7 @@ includedirectivep = do customFailure $ parseErrorAt off $ "This included file forms a cycle: " ++ filepath -- Read the file's content, or throw an error - childInput <- lift $ readFilePortably filepath `orRethrowIOError` "reading a file failed" + childInput <- lift $ readFilePortably filepath `orRethrowIOError` "failed to read a file" let initChildj = newJournalWithParseStateFrom filepath parentj -- Choose a reader based on the file path prefix or file extension, From 28f60bcf920207155c872e90b7284576d6f48b32 Mon Sep 17 00:00:00 2001 From: Simon Michael Date: Fri, 11 Jul 2025 20:33:38 -0700 Subject: [PATCH 14/28] dev: includedirectivep: refactor --- hledger-lib/Hledger/Read/JournalReader.hs | 50 ++++++++--------------- 1 file changed, 17 insertions(+), 33 deletions(-) diff --git a/hledger-lib/Hledger/Read/JournalReader.hs b/hledger-lib/Hledger/Read/JournalReader.hs index a3537d28b..a6cca690a 100644 --- a/hledger-lib/Hledger/Read/JournalReader.hs +++ b/hledger-lib/Hledger/Read/JournalReader.hs @@ -283,12 +283,6 @@ directivep = (do ] ) "directive" --- Get the canonical path of the file referenced by this parse position. --- Symbolic links will be dereferenced. This probably will always succeed --- (since the parse file's path is probably always absolute). -sourcePosFilePath :: MonadIO m => SourcePos -> m FilePath -sourcePosFilePath = liftIO . canonicalizePath . sourceName - -- | Parse an include directive, and the file(s) it refers to, possibly recursively. -- include's argument is a file path or glob pattern, optionally with a file type prefix. -- ~ at the start is expanded to the user's home directory. @@ -309,7 +303,7 @@ includedirectivep = do f <- sourcePosFilePath pos when (null $ dbg6 (f <> " include: glob pattern") glb) $ customFailure $ parseErrorAt off $ "include needs a file path or glob pattern argument" - paths <- getFilePaths2 off pos glb + paths <- getFilePaths off pos glb let prefixedpaths = case mprefix of Nothing -> paths Just fmt -> map ((show fmt++":")++) paths @@ -318,36 +312,14 @@ includedirectivep = do where - -- XXX keep providing via --old-glob for a bit ? - -- Find the files matched by a glob pattern, using Glob. - -- Uses the current parse context for detecting the current directory and for error messages. - _getFilePaths1 :: MonadIO m => Int -> SourcePos -> FilePath -> JournalParser m [FilePath] - _getFilePaths1 off pos fileglobpattern = do - -- Expand a ~ at the start of the glob pattern, if any. - fileglobpattern' <- lift $ expandHomePath fileglobpattern - `orRethrowIOError` (show pos ++ " locating " ++ fileglobpattern) - -- Compile the glob pattern. - fileglob <- case tryCompileWith compDefault{errorRecovery=False} fileglobpattern' of - Right x -> pure x - Left e -> customFailure $ parseErrorAt off $ "Invalid glob pattern: " ++ e - -- Get the directory of the including file. - parentfile <- sourcePosFilePath pos - let cwd = takeDirectory parentfile - -- Find all matched files, in lexicographic order (the order ls would normally show them) - filepaths <- liftIO $ (dbg6 (parentfile <> " include: matched files") . sort) <$> globDir1 fileglob cwd - if (not . null) filepaths - then pure filepaths - else customFailure $ parseErrorAt off $ "No files were matched by file pattern: " ++ fileglobpattern - -- Find the files matched by a glob pattern, if any, using filepattern. -- Uses the current parse context for detecting the current directory and for error messages. -- This one also ignores all dotted directories (anything under .git/, foo/.secret/, etc.) - getFilePaths2 :: MonadIO m => Int -> SourcePos -> FilePath -> JournalParser m [FilePath] - getFilePaths2 off pos globpattern = do + getFilePaths :: MonadIO m => Int -> SourcePos -> FilePath -> JournalParser m [FilePath] + getFilePaths off pos globpattern = do -- expand a tilde at the start of the glob pattern, or throw an error - expandedglob <- lift $ expandHomePath globpattern - `orRethrowIOError` (show pos ++ " locating " ++ globpattern) + expandedglob <- lift $ expandHomePath globpattern `orRethrowIOError` "failed to expand ~" -- get the directory of the including file parentfile <- sourcePosFilePath pos @@ -355,8 +327,14 @@ includedirectivep = do -- Find all matched files, in lexicographic order (the order ls would normally show them). -- (This might include the current file.) + -- 1. Old implementation, using Glob XXX keep providing this via --old-glob for a bit + _g <- case tryCompileWith compDefault{errorRecovery=False} expandedglob of + Right x -> pure x + Left e -> customFailure $ parseErrorAt off $ "Invalid glob pattern: " ++ e + _filepaths <- liftIO $ (dbg6 (parentfile <> " include: matched files") . sort) <$> globDir1 _g cwd + -- 2. New implementation, using filepattern filepaths <- liftIO $ - map (cwd ) + map (dbg6 "cwd" cwd ) -- . sort -- XXX needed ? <$> getDirectoryFilesIgnore cwd [expandedglob] ["**/.*/**"] @@ -432,6 +410,12 @@ includedirectivep = do ,jincludefilestack = filepath : jincludefilestack j } +-- Get the canonical path of the file referenced by this parse position. +-- Symbolic links will be dereferenced. This probably will always succeed +-- (since the parse file's path is probably always absolute). +sourcePosFilePath :: (MonadIO m) => SourcePos -> m FilePath +sourcePosFilePath = liftIO . canonicalizePath . sourceName + -- | Lift an IO action into the exception monad, rethrowing any IO -- error with the given message prepended. orRethrowIOError :: MonadIO m => IO a -> String -> TextParser m a From 42b9c8b0fb7799ea5609fca3f66fc02cc8519791 Mon Sep 17 00:00:00 2001 From: Simon Michael Date: Mon, 14 Jul 2025 09:56:14 -0700 Subject: [PATCH 15/28] dev: consolidate/expand include tests --- hledger/test/{journal => cli}/a.timeclock | 0 hledger/test/{journal => cli}/b.timedot | 0 hledger/test/cli/multiple-files.test | 2 +- hledger/test/journal/include.test | 105 ------------------ hledger/test/journal/include/.cycle/cycle.j | 1 + .../journal/include/.cycle/cycle2/cycle2.j | 1 + hledger/test/journal/include/.dota.j | 0 hledger/test/journal/include/a.j | 2 + hledger/test/journal/include/a.timeclock | 2 + hledger/test/journal/include/a2.j | 0 hledger/test/journal/include/b.timedot | 3 + .../test/journal/include/b/.dotdir/dotdirb.j | 0 hledger/test/journal/include/b/b.j | 0 .../journal/include/b/bb/.dotdir/dotdirbb.j | 0 hledger/test/journal/include/b/bb/bb.j | 0 hledger/test/journal/include/c/c.j | 0 hledger/test/journal/include/glob1.j | 1 + hledger/test/journal/include/glob2.j | 1 + hledger/test/journal/include/glob3.j | 1 + hledger/test/journal/include/glob4.j | 1 + hledger/test/journal/include/glob5.j | 1 + hledger/test/journal/include/include.test | 98 ++++++++++++++++ hledger/test/journal/include/self.j | 1 + 23 files changed, 114 insertions(+), 106 deletions(-) rename hledger/test/{journal => cli}/a.timeclock (100%) rename hledger/test/{journal => cli}/b.timedot (100%) delete mode 100644 hledger/test/journal/include.test create mode 100644 hledger/test/journal/include/.cycle/cycle.j create mode 100644 hledger/test/journal/include/.cycle/cycle2/cycle2.j create mode 100644 hledger/test/journal/include/.dota.j create mode 100644 hledger/test/journal/include/a.j create mode 100644 hledger/test/journal/include/a.timeclock create mode 100644 hledger/test/journal/include/a2.j create mode 100644 hledger/test/journal/include/b.timedot create mode 100644 hledger/test/journal/include/b/.dotdir/dotdirb.j create mode 100644 hledger/test/journal/include/b/b.j create mode 100644 hledger/test/journal/include/b/bb/.dotdir/dotdirbb.j create mode 100644 hledger/test/journal/include/b/bb/bb.j create mode 100644 hledger/test/journal/include/c/c.j create mode 100644 hledger/test/journal/include/glob1.j create mode 100644 hledger/test/journal/include/glob2.j create mode 100644 hledger/test/journal/include/glob3.j create mode 100644 hledger/test/journal/include/glob4.j create mode 100644 hledger/test/journal/include/glob5.j create mode 100644 hledger/test/journal/include/include.test create mode 100644 hledger/test/journal/include/self.j diff --git a/hledger/test/journal/a.timeclock b/hledger/test/cli/a.timeclock similarity index 100% rename from hledger/test/journal/a.timeclock rename to hledger/test/cli/a.timeclock diff --git a/hledger/test/journal/b.timedot b/hledger/test/cli/b.timedot similarity index 100% rename from hledger/test/journal/b.timedot rename to hledger/test/cli/b.timedot diff --git a/hledger/test/cli/multiple-files.test b/hledger/test/cli/multiple-files.test index e89c905f7..4b7523efa 100644 --- a/hledger/test/cli/multiple-files.test +++ b/hledger/test/cli/multiple-files.test @@ -41,7 +41,7 @@ $ hledger print -f personal.journal -f business.journal -f alias.journal -f pers # ** 3. files can be of different formats -$ hledger print -f personal.journal -f ../journal/a.timeclock -f ../journal/b.timedot +$ hledger print -f personal.journal -f a.timeclock -f b.timedot 2014-01-02 expenses:food $1 assets:cash diff --git a/hledger/test/journal/include.test b/hledger/test/journal/include.test deleted file mode 100644 index 134673164..000000000 --- a/hledger/test/journal/include.test +++ /dev/null @@ -1,105 +0,0 @@ -# * include directive - -# ** 1. nested includes in subdirectories -$ mkdir -p b/c/d ; printf '2010/1/1\n (D) 1\n' >b/c/d/d.journal ; printf '2010/1/1\n (C) 1\ninclude d/d.journal\n' >b/c/c.journal ; printf '2010/1/1\n (B) 1\ninclude c/c.journal\n' >b/b.journal ; printf '2010/1/1\n (A) 1\ninclude b/b.journal\n' >a.journal ; hledger -f a.journal print; rm -rf a.journal bG -2010-01-01 - (A) 1 - -2010-01-01 - (B) 1 - -2010-01-01 - (C) 1 - -2010-01-01 - (D) 1 - ->= 0 - -# ** 2. Including other formats. -< -2016/1/1 - (x) 1 - -include a.timeclock -include b.timedot -$ hledger -f - print -2016-01-01 - (x) 1 - -2016-01-01 * 12:00-16:00 - (a:aa) 4.00h - -2016-01-01 * - (b.bb) 1.00 - ->= - -# ** 3. include glob patterns -< -include *b.journal -$ printf '2018/01/01\n (A) 1\n' >ab.journal; printf '2018/01/01\n (B) 1' >bb.journal; hledger -f - print; rm -f ab.journal bb.journal -2018-01-01 - (A) 1 - -2018-01-01 - (B) 1 - ->= - -# ** 4. include invalid glob patterns -< -include [.journal -$ hledger -f - print ->2 // ->= 1 - -# ** 5. include nonexistent file -< -include doesnotexist.journal -$ hledger -f - print ->2 // ->= 1 - -# ** 6. trailing whitespace after the filename is ignored -< -include a.timeclock -$ hledger -f - check - -# ** 7. a same-line or multi-line following comment is ignored -< -include a.timeclock ; comment - ; comment - ; comment -$ hledger -f - check - -# ** 8. include relative to home -< -include ~/included.journal -$ printf '2018/01/01\n (A) 1\n' >included.journal; HOME="$PWD" hledger -f - print; rm -rf included.journal -2018-01-01 - (A) 1 - ->= 0 - -# The next tests require hard coded file names, so are not concurrent-safe. -# They use different file names so a single concurrent shelltest invocation will be fine. - -# ** 9. test that order of include files is maintained -$ printf 'include _b\n' >_a; touch _b; hledger -f _a stats -v | grep _ | sed -e 's%.*/%%'; rm -rf _a _b -_a -_b - -# ** 10. and with --auto code path -$ printf 'include _d\n=\n' >_c; touch _d; hledger -f _c stats -v --auto | grep _ | sed -e 's%.*/%%'; rm -rf _c _d -_c -_d - -# ** 11. include using old !include directive -< -!include f.journal -$ printf '2018/01/01\n (A) 1\n' >f.journal; hledger -f - print; rm -f f.journal -2018-01-01 - (A) 1 - ->= diff --git a/hledger/test/journal/include/.cycle/cycle.j b/hledger/test/journal/include/.cycle/cycle.j new file mode 100644 index 000000000..91b75d85d --- /dev/null +++ b/hledger/test/journal/include/.cycle/cycle.j @@ -0,0 +1 @@ +include cycle2/cycle2.j diff --git a/hledger/test/journal/include/.cycle/cycle2/cycle2.j b/hledger/test/journal/include/.cycle/cycle2/cycle2.j new file mode 100644 index 000000000..b22e22c1f --- /dev/null +++ b/hledger/test/journal/include/.cycle/cycle2/cycle2.j @@ -0,0 +1 @@ +include ../cycle.j diff --git a/hledger/test/journal/include/.dota.j b/hledger/test/journal/include/.dota.j new file mode 100644 index 000000000..e69de29bb diff --git a/hledger/test/journal/include/a.j b/hledger/test/journal/include/a.j new file mode 100644 index 000000000..c9dcfaff5 --- /dev/null +++ b/hledger/test/journal/include/a.j @@ -0,0 +1,2 @@ +2016-01-01 + (a) 1 diff --git a/hledger/test/journal/include/a.timeclock b/hledger/test/journal/include/a.timeclock new file mode 100644 index 000000000..394258589 --- /dev/null +++ b/hledger/test/journal/include/a.timeclock @@ -0,0 +1,2 @@ +i 2016-01-01 12:00:00 a:aa +o 2016-01-01 16:00:00 diff --git a/hledger/test/journal/include/a2.j b/hledger/test/journal/include/a2.j new file mode 100644 index 000000000..e69de29bb diff --git a/hledger/test/journal/include/b.timedot b/hledger/test/journal/include/b.timedot new file mode 100644 index 000000000..ccbea9760 --- /dev/null +++ b/hledger/test/journal/include/b.timedot @@ -0,0 +1,3 @@ +2016/1/1 +b.bb .... + diff --git a/hledger/test/journal/include/b/.dotdir/dotdirb.j b/hledger/test/journal/include/b/.dotdir/dotdirb.j new file mode 100644 index 000000000..e69de29bb diff --git a/hledger/test/journal/include/b/b.j b/hledger/test/journal/include/b/b.j new file mode 100644 index 000000000..e69de29bb diff --git a/hledger/test/journal/include/b/bb/.dotdir/dotdirbb.j b/hledger/test/journal/include/b/bb/.dotdir/dotdirbb.j new file mode 100644 index 000000000..e69de29bb diff --git a/hledger/test/journal/include/b/bb/bb.j b/hledger/test/journal/include/b/bb/bb.j new file mode 100644 index 000000000..e69de29bb diff --git a/hledger/test/journal/include/c/c.j b/hledger/test/journal/include/c/c.j new file mode 100644 index 000000000..e69de29bb diff --git a/hledger/test/journal/include/glob1.j b/hledger/test/journal/include/glob1.j new file mode 100644 index 000000000..00a8d6018 --- /dev/null +++ b/hledger/test/journal/include/glob1.j @@ -0,0 +1 @@ +include ** diff --git a/hledger/test/journal/include/glob2.j b/hledger/test/journal/include/glob2.j new file mode 100644 index 000000000..5058debd5 --- /dev/null +++ b/hledger/test/journal/include/glob2.j @@ -0,0 +1 @@ +include **.j diff --git a/hledger/test/journal/include/glob3.j b/hledger/test/journal/include/glob3.j new file mode 100644 index 000000000..794c875ba --- /dev/null +++ b/hledger/test/journal/include/glob3.j @@ -0,0 +1 @@ +include */**.j diff --git a/hledger/test/journal/include/glob4.j b/hledger/test/journal/include/glob4.j new file mode 100644 index 000000000..c296029da --- /dev/null +++ b/hledger/test/journal/include/glob4.j @@ -0,0 +1 @@ +include **/*.j diff --git a/hledger/test/journal/include/glob5.j b/hledger/test/journal/include/glob5.j new file mode 100644 index 000000000..f3a2c1b93 --- /dev/null +++ b/hledger/test/journal/include/glob5.j @@ -0,0 +1 @@ +include */**/*.j diff --git a/hledger/test/journal/include/include.test b/hledger/test/journal/include/include.test new file mode 100644 index 000000000..182998455 --- /dev/null +++ b/hledger/test/journal/include/include.test @@ -0,0 +1,98 @@ +# * include directive + +# ** 1. journal, timeclock, and timedot files can be included. +# Trailing whitespace or comments are ignored. +# The order of includes is respected. +include a.j +include b.timedot ; comment +include a.timeclock + ; comment + +$ hledger -f - print +2016-01-01 + (a) 1 + +2016-01-01 * + (b.bb) 1.00 + +2016-01-01 * 12:00-16:00 + (a:aa) 4.00h + +>= + +# ** 2. The old !include spelling still works. +< +!include f.journal +$ printf '2018/01/01\n (A) 1\n' >f.journal; hledger -f - print; rm -f f.journal +2018-01-01 + (A) 1 + +>= + +# ** 3. include with no argument: argument error +< +include +$ hledger -f- files +>2 /include needs a.*argument/ +>=1 + +# ** 4. include a nonexistent file: no files matched error +< +include nosuchfile +$ hledger -f- files +>2 /No files were matched/ +>=1 + +# # ** 5. include a directory: no files matched error +# < +# include b +# $ hledger -f- files +# >2 /No files were matched/ +# >=1 + +# ** 6. Including the current file is harmless, it's ignored automatically. +$ hledger -f self.j files +> /self\.j$/ + +# # ** 7. Including a cycle is harmless, the cycle is pruned automatically. +# $ hledger -f .cycle/cycle.j files +# > /cyc/ +# >=1 + +# ** 8. A leading tilde is expanded to $HOME. +< +include ~/included.journal +$ printf '2018/01/01\n (A) 1\n' >included.journal; HOME="$PWD" hledger -f - print; rm -rf included.journal +2018-01-01 + (A) 1 + +>= 0 + +# # ** 9. Glob patterns. Include a malformed glob -> glob error +# < +# include *** +# $ hledger -f- files +# >2 /malformed/ +# >=1 + +# ** 10. Include invalid glob patterns +< +include [.journal +$ hledger -f - print +>2 // +>= 1 + +# # ** 11. Include ** -> all files +# $ hledger -f glob1.j files + +# # ** 12. Include **.EXT -> ? +# $ hledger -f glob2.j files + +# # ** 13. Include */**.EXT -> ? +# $ hledger -f glob3.j files + +# # ** 14. Include **/*.EXT -> all files in or below cwd with .EXT +# $ hledger -f glob4.j files + +# # ** 15. Include */**/*.EXT -> all files below cwd with .EXT +# $ hledger -f glob5.j files diff --git a/hledger/test/journal/include/self.j b/hledger/test/journal/include/self.j new file mode 100644 index 000000000..428713f5d --- /dev/null +++ b/hledger/test/journal/include/self.j @@ -0,0 +1 @@ +include self.j From 1046f652b14ea1246d4dafe91fb5e13421cd6efa Mon Sep 17 00:00:00 2001 From: Simon Michael Date: Mon, 14 Jul 2025 11:31:36 -0700 Subject: [PATCH 16/28] dev: PrefixedFilePath cleanups And some helpers that weren't needed after all, but maybe in future --- hledger-lib/Hledger/Data/Types.hs | 2 + hledger-lib/Hledger/Read/JournalReader.hs | 50 ++++++++++++++++------- 2 files changed, 37 insertions(+), 15 deletions(-) diff --git a/hledger-lib/Hledger/Data/Types.hs b/hledger-lib/Hledger/Data/Types.hs index 1196197c1..b5f991400 100644 --- a/hledger-lib/Hledger/Data/Types.hs +++ b/hledger-lib/Hledger/Data/Types.hs @@ -672,6 +672,8 @@ data SepFormat | Ssv -- semicolon-separated deriving (Eq, Ord) +-- XXX A little confusion, this is also used to name readers in splitReaderPrefix. +-- readers, input formats, and output formats overlap but are distinct concepts. -- | The id of a data format understood by hledger, eg @journal@ or @csv@. -- The --output-format option selects one of these for output. data StorageFormat diff --git a/hledger-lib/Hledger/Read/JournalReader.hs b/hledger-lib/Hledger/Read/JournalReader.hs index a6cca690a..c94397010 100644 --- a/hledger-lib/Hledger/Read/JournalReader.hs +++ b/hledger-lib/Hledger/Read/JournalReader.hs @@ -167,27 +167,47 @@ findReader Nothing (Just path) = (prefix,path') = splitReaderPrefix path ext = map toLower $ drop 1 $ takeExtension path' --- | A file path optionally prefixed by a reader name and colon --- (journal:, csv:, timedot:, etc.). +-- | A prefix used to specify a particular reader to be used for a file path, +-- overriding the file extension. It is a valid reader name followed by a colon. +-- Eg journal:, csv:, timeclock:, timedot:. +-- type ReaderPrefix = String + +-- | A file path with an optional reader prefix. type PrefixedFilePath = FilePath --- | If a filepath is prefixed by one of the reader names and a colon, --- split that off. Eg "csv:-" -> (Just "csv", "-"). --- These reader prefixes can be used to force a specific reader, --- overriding the file extension. +-- | Separate a file path and its reader prefix, if any. +-- +-- >>> splitReaderPrefix "csv:-" +-- (Just csv,"-") splitReaderPrefix :: PrefixedFilePath -> (Maybe StorageFormat, FilePath) splitReaderPrefix f = let - candidates = [(Just r, drop (length r + 1) f) | r <- readerNames ++ ["ssv","tsv"], (r++":") `isPrefixOf` f] - (strPrefix, newF) = headDef (Nothing, f) candidates + candidates = [(Just r, drop (length r + 1) f) | r <- readerNames ++ ["ssv","tsv"], (r++":") `isPrefixOf` f] + (strPrefix, newF) = headDef (Nothing, f) candidates in case strPrefix of - Just "csv" -> (Just (Sep Csv), newF) - Just "tsv" -> (Just (Sep Tsv), newF) - Just "ssv" -> (Just (Sep Ssv), newF) - Just "journal" -> (Just Journal', newF) - Just "timeclock" -> (Just Timeclock, newF) - Just "timedot" -> (Just Timedot, newF) - _ -> (Nothing, f) + Just "csv" -> (Just (Sep Csv), newF) + Just "tsv" -> (Just (Sep Tsv), newF) + Just "ssv" -> (Just (Sep Ssv), newF) + Just "journal" -> (Just Journal', newF) + Just "timeclock" -> (Just Timeclock, newF) + Just "timedot" -> (Just Timedot, newF) + _ -> (Nothing, f) + +-- -- | Does this file path have a reader prefix ? +-- hasReaderPrefix :: PrefixedFilePath -> Bool +-- hasReaderPrefix = isJust . fst. splitReaderPrefix + +-- -- | Add a reader prefix to a file path, unless it already has one. +-- -- The argument should be a valid reader name. +-- -- +-- -- >>> addReaderPrefix "csv" "a.txt" +-- -- >>> "csv:a.txt" +-- -- >>> addReaderPrefix "csv" "timedot:a.txt" +-- -- >>> "timedot:a.txt" +-- addReaderPrefix :: ReaderPrefix -> FilePath -> PrefixedFilePath +-- addReaderPrefix readername f +-- | hasReaderPrefix f = f +-- | otherwise = readername <> ":" <> f --- ** reader From b4a1add2675b550193e174f8f414eaab5557a133 Mon Sep 17 00:00:00 2001 From: Simon Michael Date: Mon, 14 Jul 2025 17:13:14 -0700 Subject: [PATCH 17/28] imp: include: more robust tests and glob pattern handling This switches from filepattern back to Glob, which is more powerful. New notes, implementation, workarounds and tests. --- hledger-lib/Hledger/Read/JournalReader.hs | 119 ++++++++++++------ hledger-lib/hledger-lib.cabal | 3 - hledger-lib/package.yaml | 1 - hledger/test/journal/include/include.test | 139 ++++++++++++++-------- 4 files changed, 175 insertions(+), 87 deletions(-) diff --git a/hledger-lib/Hledger/Read/JournalReader.hs b/hledger-lib/Hledger/Read/JournalReader.hs index c94397010..e85603b7b 100644 --- a/hledger-lib/Hledger/Read/JournalReader.hs +++ b/hledger-lib/Hledger/Read/JournalReader.hs @@ -73,7 +73,7 @@ where --- ** imports import qualified Control.Monad.Fail as Fail (fail) import qualified Control.Exception as C -import Control.Monad (forM_, when, void, unless) +import Control.Monad (forM_, when, void, unless, filterM) import Control.Monad.IO.Class (MonadIO, liftIO) import Control.Monad.Except (ExceptT(..), runExceptT) import Control.Monad.State.Strict (evalStateT,get,modify',put) @@ -94,7 +94,7 @@ import Text.Megaparsec.Char import Text.Printf import System.FilePath import "Glob" System.FilePath.Glob hiding (match) -import "filepattern" System.FilePattern.Directory +-- import "filepattern" System.FilePattern.Directory import Hledger.Data import Hledger.Read.Common @@ -104,7 +104,8 @@ import qualified Hledger.Read.CsvReader as CsvReader (reader) import qualified Hledger.Read.RulesReader as RulesReader (reader) import qualified Hledger.Read.TimeclockReader as TimeclockReader (reader) import qualified Hledger.Read.TimedotReader as TimedotReader (reader) -import System.Directory (canonicalizePath) +import System.Directory (canonicalizePath, doesFileExist) +import Data.Functor ((<&>)) --- ** doctest setup -- $setup @@ -313,30 +314,66 @@ includedirectivep = do -- save the position off <- getOffset pos <- getSourcePos - -- parse + + -- parse the directive string "include" lift skipNonNewlineSpaces1 prefixedglob <- rstrip . T.unpack <$> takeWhileP Nothing (`notElem` [';','\n']) lift followingcommentp - -- find file(s) let (mprefix,glb) = splitReaderPrefix prefixedglob f <- sourcePosFilePath pos when (null $ dbg6 (f <> " include: glob pattern") glb) $ customFailure $ parseErrorAt off $ "include needs a file path or glob pattern argument" - paths <- getFilePaths off pos glb + + -- Find the file or glob-matched files (just the ones from this include directive), with some IO error checking. + -- Also report whether it was a literal path and not a glob pattern. + (paths, isliteral) <- findMatchedFiles off pos glb + + -- XXX worth the troublel ? + -- Handle duplicates. Some complexities here: + -- + -- If this include directive uses a glob pattern, remove duplicates. + -- Ie if this glob pattern matches any files we have already processed (or the current file), + -- due to multiple includes in sequence or in a cycle, exclude those files so they're not processed again. + -- + -- If this include directive uses a literal file path, don't remove duplicates. + -- Multiple includes in sequence will cause the included file to be processed multiple times. + -- Multiple includes forming a cycle will be detected and reported as an error in parseIncludedFile. + let paths' = if isliteral then paths else filter (const True) paths + + -- if there was a reader prefix, apply it to all the file paths let prefixedpaths = case mprefix of - Nothing -> paths - Just fmt -> map ((show fmt++":")++) paths - -- parse them inline + Nothing -> paths' + Just fmt -> map ((show fmt++":")++) paths' + + -- parse each one, as if inlined here forM_ prefixedpaths $ parseIncludedFile off pos where - -- Find the files matched by a glob pattern, if any, using filepattern. + -- | Find the files matched by a literal path or a glob pattern. -- Uses the current parse context for detecting the current directory and for error messages. - -- This one also ignores all dotted directories (anything under .git/, foo/.secret/, etc.) - getFilePaths :: MonadIO m => Int -> SourcePos -> FilePath -> JournalParser m [FilePath] - getFilePaths off pos globpattern = do + -- Expands a leading tilde to the user's home directory. + -- Glob patterns at the start of a path component exclude dot-named files and directories. + -- + -- Checks if any matched paths are directories and excludes those. + -- Converts all matched paths to their canonical form. + findMatchedFiles :: MonadIO m => Int -> SourcePos -> FilePath -> JournalParser m ([FilePath], Bool) + findMatchedFiles off pos globpattern = do + -- Some notes about the Glob library: + -- ---------------------------------- + -- It does not expand tilde. + -- It does not canonicalise paths. + -- The results are not in any particular order. + -- The results can include directories. + -- DIRPAT/ is equivalent to DIRPAT, except results will end with // (double slash). + -- . or .. at the start of a pattern can match the current or parent directories. + -- * matches any number of characters in a file or directory name. + -- * at the start of a file name ignores dot-named files and directories, by default. + -- ** (or any number of consecutive *'s) not followed by slash is equivalent to *. + -- A **/ component matches any number of directory parts. + -- A **/ ignores dot-named directories in its starting and ending directories, by default. + -- But **/ does search intermediate dot-named directories. Eg it can find a/.b/c. -- expand a tilde at the start of the glob pattern, or throw an error expandedglob <- lift $ expandHomePath globpattern `orRethrowIOError` "failed to expand ~" @@ -345,29 +382,43 @@ includedirectivep = do parentfile <- sourcePosFilePath pos let cwd = takeDirectory parentfile - -- Find all matched files, in lexicographic order (the order ls would normally show them). - -- (This might include the current file.) - -- 1. Old implementation, using Glob XXX keep providing this via --old-glob for a bit - _g <- case tryCompileWith compDefault{errorRecovery=False} expandedglob of - Right x -> pure x + -- Compile as a Glob Pattern. Can throw an error. + g <- case tryCompileWith compDefault{errorRecovery=False} expandedglob of Left e -> customFailure $ parseErrorAt off $ "Invalid glob pattern: " ++ e - _filepaths <- liftIO $ (dbg6 (parentfile <> " include: matched files") . sort) <$> globDir1 _g cwd - -- 2. New implementation, using filepattern - filepaths <- liftIO $ - map (dbg6 "cwd" cwd ) - -- . sort -- XXX needed ? - <$> - getDirectoryFilesIgnore cwd [expandedglob] ["**/.*/**"] + Right _ | "***" `isInfixOf` expandedglob -> customFailure $ parseErrorAt off $ "Invalid glob pattern: too many stars" + Right x -> pure x - -- Throw an error if no files (not even the current file) were matched. - when (null filepaths) $ - customFailure $ parseErrorAt off $ "No files were matched by file pattern: " ++ globpattern + -- Find all matched paths, in lexicographic order (the order ls would normally show them). + -- These might include directories or the current file. + paths <- liftIO $ + -- (dbg6 (parentfile <> " include: matched paths") . sort) <$> + globDir1 g cwd - -- If the current file was matched, exclude it now. - let filepaths' = filter (/= parentfile) filepaths - dbg6IO (parentfile <> " include: matched files (excluding current file)") filepaths' + -- Exclude any directories or symlinks to directories, and canonicalise + files <- liftIO $ + filterM doesFileExist paths + >>= mapM canonicalizePath + <&> (dbg6 (parentfile <> " include: matched files") . sort) - pure filepaths' + -- If a glob was used: exclude any intermediate dot directories that were searched. + -- As noted above, while **/ ignores dot dirs in the starting and ending dirs, + -- it does search dot dirs in between those two (something that should be fixed in Glob ?). + -- This seems likely to be inconvenient, eg when trying to avoid .git directories in subrepos. + -- So as an imperfect workaround: when using any glob, exclude all paths involving dot dirs. + -- Unfortunately this means valid globs like .dotdir/* will not succeed; only a literal + -- .dotdir/foo would work there. + let + files' = if isLiteral g then files else filter (not.hasdotdir) files + where + hasdotdir p = any isdotdir $ splitPath p + where + isdotdir c = "." `isPrefixOf` c && "/" `isSuffixOf` c + + -- Throw an error if no files were matched. + when (null files') $ + customFailure $ parseErrorAt off $ "No files were matched by glob pattern: " ++ globpattern + + return (files', isLiteral g) -- Parse the given included file (and any deeper includes, recursively) -- as if it was inlined in the current (parent) file. @@ -1195,8 +1246,8 @@ tests_JournalReader = testGroup "JournalReader" [ assertParse ignoredpricecommoditydirectivep "N $\n" ,testGroup "includedirectivep" [ - testCase "include" $ assertParseErrorE includedirectivep "include nosuchfile\n" "No existing files match pattern: nosuchfile" - ,testCase "glob" $ assertParseErrorE includedirectivep "include nosuchfile*\n" "No existing files match pattern: nosuchfile*" + testCase "include" $ assertParseErrorE includedirectivep "include nosuchfile\n" "No files were matched by glob pattern: nosuchfile" + ,testCase "glob" $ assertParseErrorE includedirectivep "include nosuchfile*\n" "No files were matched by glob pattern: nosuchfile*" ] ,testCase "marketpricedirectivep" $ assertParseEq marketpricedirectivep diff --git a/hledger-lib/hledger-lib.cabal b/hledger-lib/hledger-lib.cabal index 0a1328956..c3bf27760 100644 --- a/hledger-lib/hledger-lib.cabal +++ b/hledger-lib/hledger-lib.cabal @@ -148,7 +148,6 @@ library , extra >=1.7.11 , file-embed >=0.0.10 , filepath - , filepattern >=0.1.3 , hashtables >=1.2.3.1 , lucid , megaparsec >=7.0.0 && <9.8 @@ -209,7 +208,6 @@ test-suite doctest , extra >=1.7.11 , file-embed >=0.0.10 , filepath - , filepattern >=0.1.3 , hashtables >=1.2.3.1 , lucid , megaparsec >=7.0.0 && <9.8 @@ -271,7 +269,6 @@ test-suite unittest , extra >=1.7.11 , file-embed >=0.0.10 , filepath - , filepattern >=0.1.3 , hashtables >=1.2.3.1 , hledger-lib , lucid diff --git a/hledger-lib/package.yaml b/hledger-lib/package.yaml index eec388e40..e58cd9041 100644 --- a/hledger-lib/package.yaml +++ b/hledger-lib/package.yaml @@ -94,7 +94,6 @@ dependencies: - utf8-string >=0.3.5 - extra >=1.7.11 - Glob >= 0.9 -- filepattern >=0.1.3 ghc-options: - -Wall diff --git a/hledger/test/journal/include/include.test b/hledger/test/journal/include/include.test index 182998455..a6d96b25b 100644 --- a/hledger/test/journal/include/include.test +++ b/hledger/test/journal/include/include.test @@ -29,37 +29,7 @@ $ printf '2018/01/01\n (A) 1\n' >f.journal; hledger -f - print; rm -f f.journa >= -# ** 3. include with no argument: argument error -< -include -$ hledger -f- files ->2 /include needs a.*argument/ ->=1 - -# ** 4. include a nonexistent file: no files matched error -< -include nosuchfile -$ hledger -f- files ->2 /No files were matched/ ->=1 - -# # ** 5. include a directory: no files matched error -# < -# include b -# $ hledger -f- files -# >2 /No files were matched/ -# >=1 - -# ** 6. Including the current file is harmless, it's ignored automatically. -$ hledger -f self.j files -> /self\.j$/ - -# # ** 7. Including a cycle is harmless, the cycle is pruned automatically. -# $ hledger -f .cycle/cycle.j files -# > /cyc/ -# >=1 - -# ** 8. A leading tilde is expanded to $HOME. +# ** 3. A leading tilde is expanded to $HOME. < include ~/included.journal $ printf '2018/01/01\n (A) 1\n' >included.journal; HOME="$PWD" hledger -f - print; rm -rf included.journal @@ -68,31 +38,102 @@ $ printf '2018/01/01\n (A) 1\n' >included.journal; HOME="$PWD" hledger -f - pr >= 0 -# # ** 9. Glob patterns. Include a malformed glob -> glob error -# < -# include *** -# $ hledger -f- files -# >2 /malformed/ -# >=1 +# ** 4. include with no argument -> argument error +< +include +$ hledger -f- files +>2 /include needs a.*argument/ +>=1 -# ** 10. Include invalid glob patterns +# ** 5. include a nonexistent file -> no files matched error +< +include nosuchfile +$ hledger -f- files +>2 /No files were matched/ +>=1 + +# ** 6. Including a directory literally -> no files matched error +< +include b +$ hledger -f- files +>2 /No files were matched/ +>=1 + +# ** 7. Include invalid glob patterns -> invalid glob error < include [.journal $ hledger -f - print ->2 // +>2 /Invalid glob/ >= 1 -# # ** 11. Include ** -> all files -# $ hledger -f glob1.j files +# ** 8. Three or more *'s -> glob error +< +include *** +$ hledger -f- files +>2 /Invalid glob/ +>=1 -# # ** 12. Include **.EXT -> ? -# $ hledger -f glob2.j files +# ** 9. Including the current file literally -> cycle error. +$ hledger -f self.j files +>2 /cycle/ +>=1 -# # ** 13. Include */**.EXT -> ? -# $ hledger -f glob3.j files +# ** 10. Including the current file via glob -> cycle error. +$ hledger -f selfglob.j files +>2 /cycle/ +>=1 -# # ** 14. Include **/*.EXT -> all files in or below cwd with .EXT -# $ hledger -f glob4.j files +# ** 11. Including a cycle, all literally -> cycle error +$ hledger -f .cycle/cycle.j files +>2 /cycle/ +>=1 -# # ** 15. Include */**/*.EXT -> all files below cwd with .EXT -# $ hledger -f glob5.j files +# ** 12. Including a cycle, involving globs -> cycle error +$ hledger -f .cycle/cycleglob.j files +>2 /cycle/ +>=1 + +# ** 13. Include ** -> cycle error (includes current file) +$ hledger -f glob1.j files +>2 /cycle/ +>=1 + +# ** 14. Include **.j -> cycle error (includes current file) +$ hledger -f glob2.j files +>2 /cycle/ +>=1 + +# ** 15. Include */**.j -> all .j files in subdirectories (**.j is same as *.j) +$ hledger -f glob3.j files | sed -E 's|.*hledger/test/journal/include/||' +glob3.j +b/b.j +c/c.j + +# ** 16. Include **/*.j -> cycle error (includes current file) +$ hledger -f glob4.j files +>2 /cycle/ +>=1 + +# ** 17. Include */**/*.j -> all non-dot .j files in or below non-dot subdirectories. +< +include */**/*.j +$ hledger -f - files | sed -E 's|.*hledger/test/journal/include/||' +- +b/b.j +b/bb/bb.j +c/c.j + +# ** 18. To avoid intermediate dot dirs in the above, we exclude all glob-matched paths involving dot dirs. +# So this does not find b/bb/.dotdir/dotdirbb.j, unfortunately: +< +include b/.dotdir/*.j +$ hledger -f - files | sed -E 's|.*hledger/test/journal/include/||' +>2 /No files were matched/ +# sed hides the non-zero exit code + +# ** 19. Only a literal path can find it. +< +include b/.dotdir/dotdirb.j +$ hledger -f - files | sed -E 's|.*hledger/test/journal/include/||' +- +b/.dotdir/dotdirb.j From 460ae288263de9116403853482610fcde2e51aa4 Mon Sep 17 00:00:00 2001 From: Simon Michael Date: Mon, 14 Jul 2025 21:14:01 -0700 Subject: [PATCH 18/28] imp: include: globs exclude current file; more cleanup --- hledger-lib/Hledger/Read/JournalReader.hs | 34 ++++++++++++++--------- hledger/test/journal/include/a.j | 2 ++ hledger/test/journal/include/glob1.j | 1 - hledger/test/journal/include/glob5.j | 1 - hledger/test/journal/include/include.test | 19 ++++++------- 5 files changed, 32 insertions(+), 25 deletions(-) delete mode 100644 hledger/test/journal/include/glob1.j delete mode 100644 hledger/test/journal/include/glob5.j diff --git a/hledger-lib/Hledger/Read/JournalReader.hs b/hledger-lib/Hledger/Read/JournalReader.hs index e85603b7b..7991bd252 100644 --- a/hledger-lib/Hledger/Read/JournalReader.hs +++ b/hledger-lib/Hledger/Read/JournalReader.hs @@ -326,20 +326,19 @@ includedirectivep = do customFailure $ parseErrorAt off $ "include needs a file path or glob pattern argument" -- Find the file or glob-matched files (just the ones from this include directive), with some IO error checking. - -- Also report whether it was a literal path and not a glob pattern. - (paths, isliteral) <- findMatchedFiles off pos glb + -- Also report whether a glob pattern was used, and not just a literal file path. + -- (paths, isglob) <- findMatchedFiles off pos glb + paths <- findMatchedFiles off pos glb - -- XXX worth the troublel ? + -- XXX worth the trouble ? no -- Handle duplicates. Some complexities here: - -- -- If this include directive uses a glob pattern, remove duplicates. -- Ie if this glob pattern matches any files we have already processed (or the current file), -- due to multiple includes in sequence or in a cycle, exclude those files so they're not processed again. - -- -- If this include directive uses a literal file path, don't remove duplicates. -- Multiple includes in sequence will cause the included file to be processed multiple times. -- Multiple includes forming a cycle will be detected and reported as an error in parseIncludedFile. - let paths' = if isliteral then paths else filter (const True) paths + let paths' = paths -- if isglob then filter (const True) paths else paths -- if there was a reader prefix, apply it to all the file paths let prefixedpaths = case mprefix of @@ -358,7 +357,8 @@ includedirectivep = do -- -- Checks if any matched paths are directories and excludes those. -- Converts all matched paths to their canonical form. - findMatchedFiles :: MonadIO m => Int -> SourcePos -> FilePath -> JournalParser m ([FilePath], Bool) + -- findMatchedFiles :: MonadIO m => Int -> SourcePos -> FilePath -> JournalParser m ([FilePath], Bool) + findMatchedFiles :: MonadIO m => Int -> SourcePos -> FilePath -> JournalParser m [FilePath] findMatchedFiles off pos globpattern = do -- Some notes about the Glob library: -- ---------------------------------- @@ -387,6 +387,7 @@ includedirectivep = do Left e -> customFailure $ parseErrorAt off $ "Invalid glob pattern: " ++ e Right _ | "***" `isInfixOf` expandedglob -> customFailure $ parseErrorAt off $ "Invalid glob pattern: too many stars" Right x -> pure x + let isglob = not $ isLiteral g -- Find all matched paths, in lexicographic order (the order ls would normally show them). -- These might include directories or the current file. @@ -394,13 +395,13 @@ includedirectivep = do -- (dbg6 (parentfile <> " include: matched paths") . sort) <$> globDir1 g cwd - -- Exclude any directories or symlinks to directories, and canonicalise + -- Exclude any directories or symlinks to directories, and canonicalise, and sort. files <- liftIO $ filterM doesFileExist paths >>= mapM canonicalizePath - <&> (dbg6 (parentfile <> " include: matched files") . sort) + <&> sort - -- If a glob was used: exclude any intermediate dot directories that were searched. + -- If a glob was used, exclude any intermediate dot directories that were searched. -- As noted above, while **/ ignores dot dirs in the starting and ending dirs, -- it does search dot dirs in between those two (something that should be fixed in Glob ?). -- This seems likely to be inconvenient, eg when trying to avoid .git directories in subrepos. @@ -408,17 +409,24 @@ includedirectivep = do -- Unfortunately this means valid globs like .dotdir/* will not succeed; only a literal -- .dotdir/foo would work there. let - files' = if isLiteral g then files else filter (not.hasdotdir) files + files2 = (if isglob then filter (not.hasdotdir) else id) files where hasdotdir p = any isdotdir $ splitPath p where isdotdir c = "." `isPrefixOf` c && "/" `isSuffixOf` c -- Throw an error if no files were matched. - when (null files') $ + when (null files2) $ customFailure $ parseErrorAt off $ "No files were matched by glob pattern: " ++ globpattern - return (files', isLiteral g) + -- If a glob was used, exclude the current file, for convenience. + let + files3 = + dbg6 (parentfile <> " include: matched files" <> if isglob then " (excluding current file)" else "") $ + (if isglob then filter (/= parentfile) else id) files2 + + -- return (files3, isglob) + return files3 -- Parse the given included file (and any deeper includes, recursively) -- as if it was inlined in the current (parent) file. diff --git a/hledger/test/journal/include/a.j b/hledger/test/journal/include/a.j index c9dcfaff5..b8edfa9da 100644 --- a/hledger/test/journal/include/a.j +++ b/hledger/test/journal/include/a.j @@ -1,2 +1,4 @@ +include a*.j + 2016-01-01 (a) 1 diff --git a/hledger/test/journal/include/glob1.j b/hledger/test/journal/include/glob1.j deleted file mode 100644 index 00a8d6018..000000000 --- a/hledger/test/journal/include/glob1.j +++ /dev/null @@ -1 +0,0 @@ -include ** diff --git a/hledger/test/journal/include/glob5.j b/hledger/test/journal/include/glob5.j deleted file mode 100644 index f3a2c1b93..000000000 --- a/hledger/test/journal/include/glob5.j +++ /dev/null @@ -1 +0,0 @@ -include */**/*.j diff --git a/hledger/test/journal/include/include.test b/hledger/test/journal/include/include.test index a6d96b25b..6ea952569 100644 --- a/hledger/test/journal/include/include.test +++ b/hledger/test/journal/include/include.test @@ -78,10 +78,9 @@ $ hledger -f self.j files >2 /cycle/ >=1 -# ** 10. Including the current file via glob -> cycle error. -$ hledger -f selfglob.j files ->2 /cycle/ ->=1 +# ** 10. Including the current file via glob -> harmless, globs ignore current file. +$ hledger -f selfglob.j files | sed -E 's|.*hledger/test/journal/include/||' +selfglob.j # ** 11. Including a cycle, all literally -> cycle error $ hledger -f .cycle/cycle.j files @@ -93,12 +92,12 @@ $ hledger -f .cycle/cycleglob.j files >2 /cycle/ >=1 -# ** 13. Include ** -> cycle error (includes current file) -$ hledger -f glob1.j files ->2 /cycle/ ->=1 +# ** 13. Glob patterns ignore the current file (once). +$ hledger -f a.j files | sed -E 's|.*hledger/test/journal/include/||' +a.j +a2.j -# ** 14. Include **.j -> cycle error (includes current file) +# ** 14. Include **.j -> cycle error (globs ignore current file, but other files include it) $ hledger -f glob2.j files >2 /cycle/ >=1 @@ -109,7 +108,7 @@ glob3.j b/b.j c/c.j -# ** 16. Include **/*.j -> cycle error (includes current file) +# ** 16. Include **/*.j -> cycle error (too many mutual includes) $ hledger -f glob4.j files >2 /cycle/ >=1 From 2dcfe22c89ccd46e0a17c0e5c0c4030c6721d20b Mon Sep 17 00:00:00 2001 From: Simon Michael Date: Tue, 15 Jul 2025 10:15:50 -0700 Subject: [PATCH 19/28] imp: include: report ** without / as an error, for clarity --- hledger-lib/Hledger/Read/JournalReader.hs | 10 ++++-- hledger/test/journal/include/glob2.j | 1 - hledger/test/journal/include/glob3.j | 1 - hledger/test/journal/include/glob4.j | 1 - hledger/test/journal/include/include.test | 41 +++++++++-------------- 5 files changed, 23 insertions(+), 31 deletions(-) delete mode 100644 hledger/test/journal/include/glob2.j delete mode 100644 hledger/test/journal/include/glob3.j delete mode 100644 hledger/test/journal/include/glob4.j diff --git a/hledger-lib/Hledger/Read/JournalReader.hs b/hledger-lib/Hledger/Read/JournalReader.hs index 7991bd252..2032cc896 100644 --- a/hledger-lib/Hledger/Read/JournalReader.hs +++ b/hledger-lib/Hledger/Read/JournalReader.hs @@ -382,10 +382,14 @@ includedirectivep = do parentfile <- sourcePosFilePath pos let cwd = takeDirectory parentfile - -- Compile as a Glob Pattern. Can throw an error. + -- Compile as a Glob Pattern (and do some extra error checking). Can throw an error. g <- case tryCompileWith compDefault{errorRecovery=False} expandedglob of - Left e -> customFailure $ parseErrorAt off $ "Invalid glob pattern: " ++ e - Right _ | "***" `isInfixOf` expandedglob -> customFailure $ parseErrorAt off $ "Invalid glob pattern: too many stars" + Left e -> + customFailure $ parseErrorAt off $ "Invalid glob pattern: " ++ e + Right _ | "***" `isInfixOf` expandedglob -> + customFailure $ parseErrorAt off $ "Invalid glob pattern: too many stars, use * or **/" + Right _ | regexMatch (toRegex' "\\*\\*[^/]") expandedglob -> + customFailure $ parseErrorAt off $ "Invalid glob pattern: double star requires slash, use **/" Right x -> pure x let isglob = not $ isLiteral g diff --git a/hledger/test/journal/include/glob2.j b/hledger/test/journal/include/glob2.j deleted file mode 100644 index 5058debd5..000000000 --- a/hledger/test/journal/include/glob2.j +++ /dev/null @@ -1 +0,0 @@ -include **.j diff --git a/hledger/test/journal/include/glob3.j b/hledger/test/journal/include/glob3.j deleted file mode 100644 index 794c875ba..000000000 --- a/hledger/test/journal/include/glob3.j +++ /dev/null @@ -1 +0,0 @@ -include */**.j diff --git a/hledger/test/journal/include/glob4.j b/hledger/test/journal/include/glob4.j deleted file mode 100644 index c296029da..000000000 --- a/hledger/test/journal/include/glob4.j +++ /dev/null @@ -1 +0,0 @@ -include **/*.j diff --git a/hledger/test/journal/include/include.test b/hledger/test/journal/include/include.test index 6ea952569..d0f5a5b8d 100644 --- a/hledger/test/journal/include/include.test +++ b/hledger/test/journal/include/include.test @@ -66,54 +66,45 @@ $ hledger -f - print >2 /Invalid glob/ >= 1 -# ** 8. Three or more *'s -> glob error +# ** 8. Two *'s without / -> invalid glob error +< +include ** +$ hledger -f- files +>2 /Invalid glob/ +>=1 + +# ** 9. Three or more *'s -> invalid glob error < include *** $ hledger -f- files >2 /Invalid glob/ >=1 -# ** 9. Including the current file literally -> cycle error. +# ** 10. Including the current file literally -> cycle error. $ hledger -f self.j files >2 /cycle/ >=1 -# ** 10. Including the current file via glob -> harmless, globs ignore current file. +# ** 11. Including the current file via glob -> harmless, globs ignore current file. $ hledger -f selfglob.j files | sed -E 's|.*hledger/test/journal/include/||' selfglob.j -# ** 11. Including a cycle, all literally -> cycle error +# ** 12. Including a cycle, all literally -> cycle error $ hledger -f .cycle/cycle.j files >2 /cycle/ >=1 -# ** 12. Including a cycle, involving globs -> cycle error +# ** 13. Including a cycle, involving globs -> cycle error $ hledger -f .cycle/cycleglob.j files >2 /cycle/ >=1 -# ** 13. Glob patterns ignore the current file (once). +# ** 14. Glob patterns ignore the current file (once). $ hledger -f a.j files | sed -E 's|.*hledger/test/journal/include/||' a.j a2.j -# ** 14. Include **.j -> cycle error (globs ignore current file, but other files include it) -$ hledger -f glob2.j files ->2 /cycle/ ->=1 - -# ** 15. Include */**.j -> all .j files in subdirectories (**.j is same as *.j) -$ hledger -f glob3.j files | sed -E 's|.*hledger/test/journal/include/||' -glob3.j -b/b.j -c/c.j - -# ** 16. Include **/*.j -> cycle error (too many mutual includes) -$ hledger -f glob4.j files ->2 /cycle/ ->=1 - -# ** 17. Include */**/*.j -> all non-dot .j files in or below non-dot subdirectories. +# ** 15. Include */**/*.j -> all non-dot .j files in or below non-dot subdirectories. < include */**/*.j $ hledger -f - files | sed -E 's|.*hledger/test/journal/include/||' @@ -122,7 +113,7 @@ b/b.j b/bb/bb.j c/c.j -# ** 18. To avoid intermediate dot dirs in the above, we exclude all glob-matched paths involving dot dirs. +# ** 16. To avoid intermediate dot dirs in the above, we exclude all glob-matched paths involving dot dirs. # So this does not find b/bb/.dotdir/dotdirbb.j, unfortunately: < include b/.dotdir/*.j @@ -130,7 +121,7 @@ $ hledger -f - files | sed -E 's|.*hledger/test/journal/include/||' >2 /No files were matched/ # sed hides the non-zero exit code -# ** 19. Only a literal path can find it. +# ** 17. Only a literal path can find it. < include b/.dotdir/dotdirb.j $ hledger -f - files | sed -E 's|.*hledger/test/journal/include/||' From 3741f9f03022a15c5f5d4fc13f9734df1dd231dc Mon Sep 17 00:00:00 2001 From: Simon Michael Date: Tue, 15 Jul 2025 10:50:41 -0700 Subject: [PATCH 20/28] fix: include: report read failures with correct line number --- hledger-lib/Hledger/Read/JournalReader.hs | 1 + 1 file changed, 1 insertion(+) diff --git a/hledger-lib/Hledger/Read/JournalReader.hs b/hledger-lib/Hledger/Read/JournalReader.hs index 2032cc896..e77afdf64 100644 --- a/hledger-lib/Hledger/Read/JournalReader.hs +++ b/hledger-lib/Hledger/Read/JournalReader.hs @@ -346,6 +346,7 @@ includedirectivep = do Just fmt -> map ((show fmt++":")++) paths' -- parse each one, as if inlined here + setOffset off forM_ prefixedpaths $ parseIncludedFile off pos where From 8215f19baa9261827b3f7062e924276f7e89be3e Mon Sep 17 00:00:00 2001 From: Simon Michael Date: Tue, 15 Jul 2025 10:50:34 -0700 Subject: [PATCH 21/28] dev: include: cleanup --- hledger-lib/Hledger/Read/JournalReader.hs | 7 +++---- hledger/test/journal/include/.cycle/cycleglob.j | 1 + hledger/test/journal/include/selfglob.j | 1 + 3 files changed, 5 insertions(+), 4 deletions(-) create mode 100644 hledger/test/journal/include/.cycle/cycleglob.j create mode 100644 hledger/test/journal/include/selfglob.j diff --git a/hledger-lib/Hledger/Read/JournalReader.hs b/hledger-lib/Hledger/Read/JournalReader.hs index e77afdf64..ef7209161 100644 --- a/hledger-lib/Hledger/Read/JournalReader.hs +++ b/hledger-lib/Hledger/Read/JournalReader.hs @@ -71,7 +71,6 @@ module Hledger.Read.JournalReader ( where --- ** imports -import qualified Control.Monad.Fail as Fail (fail) import qualified Control.Exception as C import Control.Monad (forM_, when, void, unless, filterM) import Control.Monad.IO.Class (MonadIO, liftIO) @@ -387,10 +386,10 @@ includedirectivep = do g <- case tryCompileWith compDefault{errorRecovery=False} expandedglob of Left e -> customFailure $ parseErrorAt off $ "Invalid glob pattern: " ++ e + Right _ | regexMatch (toRegex' "\\*\\*[^/]") expandedglob -> + customFailure $ parseErrorAt off $ "Invalid glob pattern: double star requires slash, use * or **/" Right _ | "***" `isInfixOf` expandedglob -> customFailure $ parseErrorAt off $ "Invalid glob pattern: too many stars, use * or **/" - Right _ | regexMatch (toRegex' "\\*\\*[^/]") expandedglob -> - customFailure $ parseErrorAt off $ "Invalid glob pattern: double star requires slash, use **/" Right x -> pure x let isglob = not $ isLiteral g @@ -507,7 +506,7 @@ orRethrowIOError io msg = do eResult <- liftIO $ (Right <$> io) `C.catch` \(e::C.IOException) -> pure $ Left $ printf "%s:\n%s" msg (show e) case eResult of Right res -> pure res - Left errMsg -> Fail.fail errMsg + Left errMsg -> fail errMsg -- Parse an account directive, adding its info to the journal's -- list of account declarations. diff --git a/hledger/test/journal/include/.cycle/cycleglob.j b/hledger/test/journal/include/.cycle/cycleglob.j new file mode 100644 index 000000000..bd09962f6 --- /dev/null +++ b/hledger/test/journal/include/.cycle/cycleglob.j @@ -0,0 +1 @@ +include */*.j diff --git a/hledger/test/journal/include/selfglob.j b/hledger/test/journal/include/selfglob.j new file mode 100644 index 000000000..0323e0b54 --- /dev/null +++ b/hledger/test/journal/include/selfglob.j @@ -0,0 +1 @@ +include selfg*.j From b35dcfda28eb71367b9a056bdd96541ef00aa346 Mon Sep 17 00:00:00 2001 From: Simon Michael Date: Tue, 15 Jul 2025 11:49:17 -0700 Subject: [PATCH 22/28] imp: include: give a clearer error for trying to include a csv file And note that other read errors may be unclear when occurring in included files. --- hledger-lib/Hledger/Read/CsvReader.hs | 7 ++++++- hledger/test/errors/csvnoinclude.test | 3 +-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/hledger-lib/Hledger/Read/CsvReader.hs b/hledger-lib/Hledger/Read/CsvReader.hs index c957865ab..747c92138 100644 --- a/hledger-lib/Hledger/Read/CsvReader.hs +++ b/hledger-lib/Hledger/Read/CsvReader.hs @@ -46,7 +46,12 @@ reader sep = Reader {rFormat = Sep sep ,rExtensions = [show sep] ,rReadFn = parse sep - ,rParser = error' "sorry, CSV files can't be included yet" -- PARTIAL: + ,rParser = fail "sorry, CSV files can't be included yet" -- PARTIAL: + -- This unnecessarily shows the CSV file's first line in the error message, + -- but gives a more useful message than just calling error'. + -- XXX Note every call to error' in Hledger.Read.* is potentially a similar problem - + -- the error message is good enough when the file was specified directly by the user, + -- but not good if it was loaded by a possibly long chain of include directives. } -- | Parse and post-process a "Journal" from CSV data, or give an error. diff --git a/hledger/test/errors/csvnoinclude.test b/hledger/test/errors/csvnoinclude.test index eb04bb3c0..1eca09bf6 100644 --- a/hledger/test/errors/csvnoinclude.test +++ b/hledger/test/errors/csvnoinclude.test @@ -1,4 +1,3 @@ $$$ hledger check -f csvnoinclude.j ->>>2 /Error: sorry, CSV files can't be included yet -/ +>>>2 /CSV files can't be included yet/ >>>= 1 From 5ec770badde9eb5a0b6cf93ee61dbe6b9fde7b40 Mon Sep 17 00:00:00 2001 From: Simon Michael Date: Tue, 15 Jul 2025 15:05:28 -0700 Subject: [PATCH 23/28] imp: include: more flexible **; show the correct line in read errors --- hledger-lib/Hledger/Read/JournalReader.hs | 60 ++++++++++++----------- hledger/test/journal/include/include.test | 30 ++++++------ 2 files changed, 47 insertions(+), 43 deletions(-) diff --git a/hledger-lib/Hledger/Read/JournalReader.hs b/hledger-lib/Hledger/Read/JournalReader.hs index ef7209161..abf65fa27 100644 --- a/hledger-lib/Hledger/Read/JournalReader.hs +++ b/hledger-lib/Hledger/Read/JournalReader.hs @@ -344,7 +344,8 @@ includedirectivep = do Nothing -> paths' Just fmt -> map ((show fmt++":")++) paths' - -- parse each one, as if inlined here + -- Parse each one, as if inlined here. + -- Reset the position to the `include` line, for error messages. setOffset off forM_ prefixedpaths $ parseIncludedFile off pos @@ -353,24 +354,23 @@ includedirectivep = do -- | Find the files matched by a literal path or a glob pattern. -- Uses the current parse context for detecting the current directory and for error messages. -- Expands a leading tilde to the user's home directory. - -- Glob patterns at the start of a path component exclude dot-named files and directories. - -- + -- Converts ** without a slash to **/*, like zsh's GLOB_STAR_SHORT, so ** also matches file name parts. + -- Glob patterns at the start of a path component will exclude dot-named files and directories. -- Checks if any matched paths are directories and excludes those. -- Converts all matched paths to their canonical form. - -- findMatchedFiles :: MonadIO m => Int -> SourcePos -> FilePath -> JournalParser m ([FilePath], Bool) findMatchedFiles :: MonadIO m => Int -> SourcePos -> FilePath -> JournalParser m [FilePath] findMatchedFiles off pos globpattern = do - -- Some notes about the Glob library: - -- ---------------------------------- + + -- Some notes about the Glob library that we use (related: https://github.com/Deewiant/glob/issues/49): -- It does not expand tilde. -- It does not canonicalise paths. -- The results are not in any particular order. -- The results can include directories. -- DIRPAT/ is equivalent to DIRPAT, except results will end with // (double slash). - -- . or .. at the start of a pattern can match the current or parent directories. - -- * matches any number of characters in a file or directory name. + -- A . or .. path component can match the current or parent directories (including them in the results). + -- * matches zero or more characters in a file or directory name. -- * at the start of a file name ignores dot-named files and directories, by default. - -- ** (or any number of consecutive *'s) not followed by slash is equivalent to *. + -- ** (or zero or more consecutive *'s) not followed by slash is equivalent to *. -- A **/ component matches any number of directory parts. -- A **/ ignores dot-named directories in its starting and ending directories, by default. -- But **/ does search intermediate dot-named directories. Eg it can find a/.b/c. @@ -382,22 +382,26 @@ includedirectivep = do parentfile <- sourcePosFilePath pos let cwd = takeDirectory parentfile - -- Compile as a Glob Pattern (and do some extra error checking). Can throw an error. - g <- case tryCompileWith compDefault{errorRecovery=False} expandedglob of - Left e -> - customFailure $ parseErrorAt off $ "Invalid glob pattern: " ++ e - Right _ | regexMatch (toRegex' "\\*\\*[^/]") expandedglob -> - customFailure $ parseErrorAt off $ "Invalid glob pattern: double star requires slash, use * or **/" - Right _ | "***" `isInfixOf` expandedglob -> - customFailure $ parseErrorAt off $ "Invalid glob pattern: too many stars, use * or **/" + -- Don't allow 3 or more stars. + when ("***" `isInfixOf` expandedglob) $ + customFailure $ parseErrorAt off $ "Invalid glob pattern: too many stars, use * or **" + + -- Make ** also match file name parts like zsh's GLOB_STAR_SHORT. + let + expandedglob' = + -- ** without a slash is equivalent to **/* + case regexReplace (toRegex' $ T.pack "\\*\\*([^/\\])") "**/*\\1" expandedglob of + Right s -> s + Left _ -> expandedglob -- ignore any error, there should be none + + -- Compile as a Pattern. Can throw an error. + g <- case tryCompileWith compDefault{errorRecovery=False} expandedglob' of + Left e -> customFailure $ parseErrorAt off $ "Invalid glob pattern: " ++ e Right x -> pure x let isglob = not $ isLiteral g - -- Find all matched paths, in lexicographic order (the order ls would normally show them). - -- These might include directories or the current file. - paths <- liftIO $ - -- (dbg6 (parentfile <> " include: matched paths") . sort) <$> - globDir1 g cwd + -- Find all matched paths. These might include directories or the current file. + paths <- liftIO $ globDir1 g cwd -- Exclude any directories or symlinks to directories, and canonicalise, and sort. files <- liftIO $ @@ -405,13 +409,12 @@ includedirectivep = do >>= mapM canonicalizePath <&> sort - -- If a glob was used, exclude any intermediate dot directories that were searched. - -- As noted above, while **/ ignores dot dirs in the starting and ending dirs, + -- Work around a Glob bug with dot dirs: while **/ ignores dot dirs in the starting and ending dirs, -- it does search dot dirs in between those two (something that should be fixed in Glob ?). - -- This seems likely to be inconvenient, eg when trying to avoid .git directories in subrepos. - -- So as an imperfect workaround: when using any glob, exclude all paths involving dot dirs. - -- Unfortunately this means valid globs like .dotdir/* will not succeed; only a literal - -- .dotdir/foo would work there. + -- This could be inconvenient, eg making it hard to avoid VCS directories in a source tree. + -- To work around: when any glob was used, paths involving dot dirs are excluded in post processing. + -- Unfortunately this means valid globs like .dotdir/* can't be used; only literal paths can match + -- things in dot dirs. let files2 = (if isglob then filter (not.hasdotdir) else id) files where @@ -429,7 +432,6 @@ includedirectivep = do dbg6 (parentfile <> " include: matched files" <> if isglob then " (excluding current file)" else "") $ (if isglob then filter (/= parentfile) else id) files2 - -- return (files3, isglob) return files3 -- Parse the given included file (and any deeper includes, recursively) diff --git a/hledger/test/journal/include/include.test b/hledger/test/journal/include/include.test index d0f5a5b8d..7ecaec9b4 100644 --- a/hledger/test/journal/include/include.test +++ b/hledger/test/journal/include/include.test @@ -66,45 +66,38 @@ $ hledger -f - print >2 /Invalid glob/ >= 1 -# ** 8. Two *'s without / -> invalid glob error -< -include ** -$ hledger -f- files ->2 /Invalid glob/ ->=1 - -# ** 9. Three or more *'s -> invalid glob error +# ** 8. Three or more *'s -> invalid glob error < include *** $ hledger -f- files >2 /Invalid glob/ >=1 -# ** 10. Including the current file literally -> cycle error. +# ** 9. Including the current file literally -> cycle error. $ hledger -f self.j files >2 /cycle/ >=1 -# ** 11. Including the current file via glob -> harmless, globs ignore current file. +# ** 10. Including the current file via glob -> harmless, globs ignore current file. $ hledger -f selfglob.j files | sed -E 's|.*hledger/test/journal/include/||' selfglob.j -# ** 12. Including a cycle, all literally -> cycle error +# ** 11. Including a cycle, all literally -> cycle error $ hledger -f .cycle/cycle.j files >2 /cycle/ >=1 -# ** 13. Including a cycle, involving globs -> cycle error +# ** 12. Including a cycle, involving globs -> cycle error $ hledger -f .cycle/cycleglob.j files >2 /cycle/ >=1 -# ** 14. Glob patterns ignore the current file (once). +# ** 13. Glob patterns ignore the current file (once). $ hledger -f a.j files | sed -E 's|.*hledger/test/journal/include/||' a.j a2.j -# ** 15. Include */**/*.j -> all non-dot .j files in or below non-dot subdirectories. +# ** 14. Include */**/*.j -> all non-dot .j files in or below non-dot subdirectories. < include */**/*.j $ hledger -f - files | sed -E 's|.*hledger/test/journal/include/||' @@ -113,6 +106,15 @@ b/b.j b/bb/bb.j c/c.j +# ** 15. ** without a slash can also match filename start, equivalent to **/* -> same result as above. +< +include */**.j +$ hledger -f - files | sed -E 's|.*hledger/test/journal/include/||' +- +b/b.j +b/bb/bb.j +c/c.j + # ** 16. To avoid intermediate dot dirs in the above, we exclude all glob-matched paths involving dot dirs. # So this does not find b/bb/.dotdir/dotdirbb.j, unfortunately: < From c74d8bf6fba82253bd76cc47bab790231cd0ccb6 Mon Sep 17 00:00:00 2001 From: Simon Michael Date: Tue, 15 Jul 2025 20:48:41 -0700 Subject: [PATCH 24/28] doc: include: update docs --- hledger/hledger.m4.md | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/hledger/hledger.m4.md b/hledger/hledger.m4.md index be17d1ca5..90f1a3d14 100644 --- a/hledger/hledger.m4.md +++ b/hledger/hledger.m4.md @@ -2513,30 +2513,40 @@ include SOMEFILE ``` This has the same effect as if SOMEFILE's content was inlined at this point. -(Any `include`s within SOMEFILE will also be inlined, recursively.) +(With any include directives in SOMEFILE processed similarly, recursively.) -Only journal files can include other files. -They can include journal, timeclock or timedot files, but not CSV files. +Only journal files can include other files. They can include journal, timeclock or timedot files, but not CSV files. -If the file path begins with a tilde, that means home directory: `include ~/main.journal`. +If the file path begins with a tilde, that means your home directory: `include ~/main.journal`. If it begins with a slash, it is an absolute path: `include /home/user/main.journal`. +Otherwise it is relative to the including file's folder: `include ../finances/main.journal`. -Otherwise it is relative to the current file's folder: `include finances/main.journal`. +Also, the path may have a file type prefix to force a specific file format, overriding the file extension(s) +(as described in [Data formats](#data-formats)): `include timedot:notes/2023.md`. -Also, the path may have a file type prefix to force a specific file format -(as described in [Data formats](#data-formats)): `include timedot:~/notes/2023*.md`. +The path may contain [glob patterns](https://en.wikipedia.org/wiki/Glob_(programming)) +to match multiple files. +hledger's globs are similar to zsh's: +`?` to match any character; +`[a-z]` to match any character in a range; +`*` to match zero or more characters that aren't a path separator (like `/`); +`**` to match zero or more subdirectories and/or zero or more characters at the start of a file name; +etc. +Also, hledger's globs always exclude the including file itself. +So, you can do -The path may contain [glob patterns] to match multiple files, eg: `include *.journal`. -Note, the current file is always excluded from the matched paths. -(Though include cycles are still possible, and will be reported as an error.) +- `include *.journal` to include all other journal files in the current directory (excluding [dot files](https://en.wikipedia.org/wiki/Hidden_file_and_hidden_directory)) +- `include **.journal` to include all other journal files in this directory and below (excluding dot directories/files) +- `include timelogs/2???.timedot` to include all timedot files named like a year number. -The special glob pattern `**` matches any number of path components. -It's useful for searching subdirectories. -Eg to include all .journal files below the current directory: `include **/*.journal`. +If you are using many, or deeply nested, include files, and have an error that's hard to pinpoint: +a good troubleshooting command is `hledger files --debug=6` (or 7). -[glob patterns]: https://hackage.haskell.org/package/Glob-0.9.2/docs/System-FilePath-Glob.html#v:compile + + + ## `P` directive From 0ea239503c1f0a015a0c30ef894c018cd3143186 Mon Sep 17 00:00:00 2001 From: Simon Michael Date: Thu, 17 Jul 2025 07:49:39 -0700 Subject: [PATCH 25/28] dev: rename the hidden --timeclock-old flag to --old-timeclock --timeclock-old is still supported as well, for now, even though this flag is not documented yet (it has appeared in changelogs/release notes and in issue discussion). --- hledger/Hledger/Cli/CliOptions.hs | 2 +- hledger/test/timeclock.test | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/hledger/Hledger/Cli/CliOptions.hs b/hledger/Hledger/Cli/CliOptions.hs index a1f4eeb79..091c703b9 100644 --- a/hledger/Hledger/Cli/CliOptions.hs +++ b/hledger/Hledger/Cli/CliOptions.hs @@ -298,7 +298,7 @@ hiddenflagsformainmode = [ ,flagNone ["pretty-tables"] (setopt "pretty" "always") "legacy flag that was renamed" ,flagNone ["anon"] (setboolopt "anon") "deprecated, renamed to --obfuscate" -- #2133, handled by anonymiseByOpts ,flagNone ["obfuscate"] (setboolopt "obfuscate") "slightly obfuscate hledger's output. Warning, does not give privacy. Formerly --anon." -- #2133, handled by maybeObfuscate - ,flagNone ["timeclock-old"] (setboolopt "oldtimeclock") "don't pair timeclock entries by account name" + ,flagNone ["old-timeclock", "timeclock-old"] (setboolopt "oldtimeclock") "don't pair timeclock entries by account name" ,flagReq ["rules-file"] (\s opts -> Right $ setopt "rules" s opts) "RULESFILE" "was renamed to --rules" ] diff --git a/hledger/test/timeclock.test b/hledger/test/timeclock.test index ec59b9f48..ecf5a95c0 100644 --- a/hledger/test/timeclock.test +++ b/hledger/test/timeclock.test @@ -147,7 +147,7 @@ $ hledger -f timeclock:- print >= -# ** 12. The --timeclock-old flag reverts to the old behavior. +# ** 12. The --old-timeclock flag reverts to the old behavior. < i 2009/1/1 08:00:00 o 2009/1/1 09:00:00 stuff on checkout record is ignored @@ -157,7 +157,7 @@ o 2009/1/2 09:00:00 i 2009/1/3 08:00:00 some:account name and a description o 2009/1/3 09:00:00 -$ hledger --timeclock-old -f timeclock:- print +$ hledger --old-timeclock -f timeclock:- print > 2009-01-01 * 08:00-09:00 () 1.00h From 01e0c34b850855f35555dda1873ba149a05f76ea Mon Sep 17 00:00:00 2001 From: Simon Michael Date: Thu, 17 Jul 2025 07:51:05 -0700 Subject: [PATCH 26/28] dev: Hledger.Utils.IO: getFlag helper, like getOpt --- hledger-lib/Hledger/Utils/IO.hs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/hledger-lib/Hledger/Utils/IO.hs b/hledger-lib/Hledger/Utils/IO.hs index 0ad91e577..fea742dce 100644 --- a/hledger-lib/Hledger/Utils/IO.hs +++ b/hledger-lib/Hledger/Utils/IO.hs @@ -52,6 +52,7 @@ module Hledger.Utils.IO ( -- * Command line parsing progArgs, + getFlag, getOpt, parseYN, parseYNA, @@ -500,6 +501,15 @@ progArgs = unsafePerformIO getArgs -- a few cases involving --color (see useColorOnStdoutUnsafe) -- --debug +-- | Given one or more long or short flag names, +-- report whether this flag is present in the command line. +-- Concatenated short flags (-a -b written as -ab) are not supported. +getFlag :: [String] -> IO Bool +getFlag names = do + let flags = map toFlag names + args <- getArgs + return $ any (`elem` args) flags + -- | Given one or more long or short option names, read the rightmost value of this option from the command line arguments. -- If the value is missing raise an error. -- Concatenated short flags (-a -b written as -ab) are not supported. From b7e35f84a21fea3e6517daea14c80c08685a080a Mon Sep 17 00:00:00 2001 From: Simon Michael Date: Thu, 17 Jul 2025 08:00:08 -0700 Subject: [PATCH 27/28] imp: include: add hidden --old-glob flag to restore old dot behaviour This disables the workaround for Glob#49, allowing glob patterns to find dot files and traverse dot directories again (sometimes too much). --- hledger-lib/Hledger/Read/JournalReader.hs | 35 ++++++++++++++--------- hledger/Hledger/Cli/CliOptions.hs | 1 + hledger/test/journal/include/include.test | 8 +++++- 3 files changed, 29 insertions(+), 15 deletions(-) diff --git a/hledger-lib/Hledger/Read/JournalReader.hs b/hledger-lib/Hledger/Read/JournalReader.hs index abf65fa27..3343e4cf3 100644 --- a/hledger-lib/Hledger/Read/JournalReader.hs +++ b/hledger-lib/Hledger/Read/JournalReader.hs @@ -304,10 +304,8 @@ directivep = (do ) "directive" -- | Parse an include directive, and the file(s) it refers to, possibly recursively. --- include's argument is a file path or glob pattern, optionally with a file type prefix. --- ~ at the start is expanded to the user's home directory. --- Relative paths are relative to the current file. --- Examples: foo.j, ../foo/bar.j, timedot:/foo/2020*, *.journal +-- include's argument is a file path or glob pattern (see findMatchedFiles for details), +-- optionally with a file type prefix. Relative paths are relative to the current file. includedirectivep :: MonadIO m => ErroringJournalParser m () includedirectivep = do -- save the position @@ -330,19 +328,19 @@ includedirectivep = do paths <- findMatchedFiles off pos glb -- XXX worth the trouble ? no - -- Handle duplicates. Some complexities here: + -- Comprehensively exclude files already processed. Some complexities here: -- If this include directive uses a glob pattern, remove duplicates. -- Ie if this glob pattern matches any files we have already processed (or the current file), -- due to multiple includes in sequence or in a cycle, exclude those files so they're not processed again. -- If this include directive uses a literal file path, don't remove duplicates. -- Multiple includes in sequence will cause the included file to be processed multiple times. -- Multiple includes forming a cycle will be detected and reported as an error in parseIncludedFile. - let paths' = paths -- if isglob then filter (const True) paths else paths + -- let paths' = if isglob then filter (...) paths else paths -- if there was a reader prefix, apply it to all the file paths let prefixedpaths = case mprefix of - Nothing -> paths' - Just fmt -> map ((show fmt++":")++) paths' + Nothing -> paths + Just fmt -> map ((show fmt++":")++) paths -- Parse each one, as if inlined here. -- Reset the position to the `include` line, for error messages. @@ -352,13 +350,21 @@ includedirectivep = do where -- | Find the files matched by a literal path or a glob pattern. + -- Examples: foo.j, ../foo/bar.j, timedot:/foo/2020*, *.journal, **.journal. + -- -- Uses the current parse context for detecting the current directory and for error messages. -- Expands a leading tilde to the user's home directory. -- Converts ** without a slash to **/*, like zsh's GLOB_STAR_SHORT, so ** also matches file name parts. - -- Glob patterns at the start of a path component will exclude dot-named files and directories. -- Checks if any matched paths are directories and excludes those. -- Converts all matched paths to their canonical form. - findMatchedFiles :: MonadIO m => Int -> SourcePos -> FilePath -> JournalParser m [FilePath] + -- + -- Glob patterns never match dot files or files under dot directories, + -- even if it seems like they should; this is a workaround for Glob bug #49. + -- This workaround is disabled if the --old-glob flag is present in the command line + -- (detected with unsafePerformIO; it's not worth a ton of boilerplate). + -- In that case, be aware ** recursive globs will search intermediate dot directories. + + findMatchedFiles :: (MonadIO m) => Int -> SourcePos -> FilePath -> JournalParser m [FilePath] findMatchedFiles off pos globpattern = do -- Some notes about the Glob library that we use (related: https://github.com/Deewiant/glob/issues/49): @@ -410,13 +416,14 @@ includedirectivep = do <&> sort -- Work around a Glob bug with dot dirs: while **/ ignores dot dirs in the starting and ending dirs, - -- it does search dot dirs in between those two (something that should be fixed in Glob ?). + -- it does search dot dirs in between those two (Glob #49). -- This could be inconvenient, eg making it hard to avoid VCS directories in a source tree. - -- To work around: when any glob was used, paths involving dot dirs are excluded in post processing. + -- We work around as follows: when any glob was used, paths involving dot dirs are excluded in post processing. -- Unfortunately this means valid globs like .dotdir/* can't be used; only literal paths can match - -- things in dot dirs. + -- things in dot dirs. An --old-glob command line flag disables this workaround, for backward compatibility. + oldglobflag <- liftIO $ getFlag ["old-glob"] let - files2 = (if isglob then filter (not.hasdotdir) else id) files + files2 = (if isglob && not oldglobflag then filter (not.hasdotdir) else id) files where hasdotdir p = any isdotdir $ splitPath p where diff --git a/hledger/Hledger/Cli/CliOptions.hs b/hledger/Hledger/Cli/CliOptions.hs index 091c703b9..42d797b56 100644 --- a/hledger/Hledger/Cli/CliOptions.hs +++ b/hledger/Hledger/Cli/CliOptions.hs @@ -299,6 +299,7 @@ hiddenflagsformainmode = [ ,flagNone ["anon"] (setboolopt "anon") "deprecated, renamed to --obfuscate" -- #2133, handled by anonymiseByOpts ,flagNone ["obfuscate"] (setboolopt "obfuscate") "slightly obfuscate hledger's output. Warning, does not give privacy. Formerly --anon." -- #2133, handled by maybeObfuscate ,flagNone ["old-timeclock", "timeclock-old"] (setboolopt "oldtimeclock") "don't pair timeclock entries by account name" + ,flagNone ["old-glob"] (setboolopt "oldglob") "don't always exclude dot files/dirs to work around Glob bug" ,flagReq ["rules-file"] (\s opts -> Right $ setopt "rules" s opts) "RULESFILE" "was renamed to --rules" ] diff --git a/hledger/test/journal/include/include.test b/hledger/test/journal/include/include.test index 7ecaec9b4..70932b359 100644 --- a/hledger/test/journal/include/include.test +++ b/hledger/test/journal/include/include.test @@ -123,9 +123,15 @@ $ hledger -f - files | sed -E 's|.*hledger/test/journal/include/||' >2 /No files were matched/ # sed hides the non-zero exit code -# ** 17. Only a literal path can find it. +# ** 17. This workaround can be disabled with --old-glob, for now. +$ hledger -f - files --old-glob | sed -E 's|.*hledger/test/journal/include/||' +- +b/.dotdir/dotdirb.j + +# ** 18. A literal path can always match dot files/dirs. < include b/.dotdir/dotdirb.j $ hledger -f - files | sed -E 's|.*hledger/test/journal/include/||' - b/.dotdir/dotdirb.j + From d5ff7fe4543f305f0cd4cf7d155d35bbc7dfee3c Mon Sep 17 00:00:00 2001 From: Simon Michael Date: Thu, 17 Jul 2025 08:10:34 -0700 Subject: [PATCH 28/28] ;doc: include: mention glob pattern's dot limitation and the --old-glob flag --- hledger/hledger.m4.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/hledger/hledger.m4.md b/hledger/hledger.m4.md index 90f1a3d14..a0c3e173a 100644 --- a/hledger/hledger.m4.md +++ b/hledger/hledger.m4.md @@ -2540,15 +2540,16 @@ So, you can do - `include **.journal` to include all other journal files in this directory and below (excluding dot directories/files) - `include timelogs/2???.timedot` to include all timedot files named like a year number. +There is a limitation: hledger's globs always exclude paths involving dot files or dot directories. +This is a workaround for unavoidable dot directory traversal; +you can disable it and revert to older behaviour with the `--old-glob` flag, for now. + If you are using many, or deeply nested, include files, and have an error that's hard to pinpoint: a good troubleshooting command is `hledger files --debug=6` (or 7). - - - ## `P` directive The `P` directive declares a market price, which is