From 81744d81a103e10f7bf609f9d288ce818af10301 Mon Sep 17 00:00:00 2001 From: Simon Michael Date: Mon, 1 Dec 2025 08:43:08 -0800 Subject: [PATCH] fix:journal:include: fix some regressions with glob matching in 1.50-1.50.3 Before 1.50, journal's include directive's handling of glob patterns (*, **, ?, etc.) had these limitations: - ** always searched intermediate dot directories - ** matched only directories, not files In 1.50-1.50.3, it had different limitations, some unintended: - it ignored all dot files, dot dirs, and symbolic links to dot dirs, even when explicitly mentioned in the pattern (unless using --old-glob) - it showed symbolic links dereferenced, eg in `hledger files` output Now it has fewer limitations, mainly this: - it ignores all dot files and dot dirs, even when explicitly mentioned (unless using --old-glob) Ie it no longer ignores symbolic links to dot dirs, and it no longer shows symbolic links dereferenced. Also: including the current file is now always harmless, whether using a glob pattern or not. Internally, file paths in the "include file stack" (jincludefilestack) are now just absolute, but not canonicalised; showing symbolic links un-dereferenced in output and error messages seems generally more useful. This might affect output elsewhere also. (Those paths are still canonicalised on the fly when checking for include cycles, not so efficiently: each time an include directive is parsed, all the current parent files and all the new glob-matched include files will be re-canonicalised. Hopefully this is unnoticeable.) --- hledger-lib/Hledger/Read/JournalReader.hs | 73 ++++++++++++++--------- hledger/test/journal/include/include.test | 64 ++++++++++---------- 2 files changed, 76 insertions(+), 61 deletions(-) diff --git a/hledger-lib/Hledger/Read/JournalReader.hs b/hledger-lib/Hledger/Read/JournalReader.hs index 4a06a09c3..cbc382e2e 100644 --- a/hledger-lib/Hledger/Read/JournalReader.hs +++ b/hledger-lib/Hledger/Read/JournalReader.hs @@ -32,6 +32,7 @@ Hledger.Read.Common, to avoid import cycles. {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE PackageImports #-} {-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE MultiWayIf #-} --- ** exports module Hledger.Read.JournalReader ( @@ -72,13 +73,14 @@ where --- ** imports import Control.Exception qualified as C -import Control.Monad (forM_, when, void, unless, filterM) +import Control.Monad (forM_, when, void, unless, filterM, forM) import Control.Monad.IO.Class (MonadIO, liftIO) import Control.Monad.Except (ExceptT(..), runExceptT) import Control.Monad.State.Strict (evalStateT,get,modify',put) import Control.Monad.Trans.Class (lift) import Data.Char (toLower) import Data.Either (isRight, lefts) +import Data.Functor ((<&>)) import Data.Map.Strict qualified as M import Data.Text (Text) import Data.String @@ -91,6 +93,7 @@ import Safe import Text.Megaparsec hiding (parse) import Text.Megaparsec.Char import Text.Printf +import System.Directory (canonicalizePath, doesFileExist, makeAbsolute) import System.FilePath import "Glob" System.FilePath.Glob hiding (match) -- import "filepattern" System.FilePattern.Directory @@ -103,8 +106,6 @@ import Hledger.Read.CsvReader qualified as CsvReader (reader) import Hledger.Read.RulesReader qualified as RulesReader (reader) import Hledger.Read.TimeclockReader qualified as TimeclockReader (reader) import Hledger.Read.TimedotReader qualified as TimedotReader (reader) -import System.Directory (canonicalizePath, doesFileExist) -import Data.Functor ((<&>)) --- ** doctest setup -- $setup @@ -341,7 +342,7 @@ includedirectivep iopts = do Just fmt -> map ((show fmt++":")++) paths -- Parse each one, as if inlined here. - forM_ prefixedpaths $ parseIncludedFile iopts eoff + forM_ prefixedpaths $ parseIncludedFile iopts where @@ -407,9 +408,30 @@ includedirectivep iopts = do -- Exclude any directories or symlinks to directories, and canonicalise, and sort. files <- liftIO $ filterM doesFileExist paths - >>= mapM canonicalizePath + >>= mapM makeAbsolute <&> sort + -- -- If a glob was used, exclude the current file, for convenience. + -- let + -- files3 = + -- dbg6 (parentf <> " include: matched files" <> if isglob then " (excluding current file)" else "") $ + -- (if isglob then filter (/= parentf) else id) files + + -- Throw an error if one of these files is among the grandparent files, forming a cycle. + -- Though, ignore the immediate parent file for convenience. XXX inconsistent - should it ignore all cyclic includes ? + -- We used to store the canonical paths, then switched to non-canonical paths for more useful output, + -- which means for each include directive we must re-canonicalise everything here; noticeable ? XXX + parentj <- get + let parentfiles = jincludefilestack parentj + cparentfiles <- liftIO $ mapM canonicalizePath parentfiles + let cparentf = take 1 parentfiles + files2 <- forM files $ \f -> do + cf <- liftIO $ canonicalizePath f + if + | [cf] == cparentf -> return cf -- current file - return canonicalised, will be excluded later + | cf `elem` drop 1 cparentfiles -> customFailure $ parseErrorAt off $ "This included file forms a cycle: " ++ f + | otherwise -> return f + -- Work around a Glob bug with dot dirs: while **/ ignores dot dirs in the starting and ending dirs, -- it does search dot dirs in between those two (Glob #49). -- This could be inconvenient, eg making it hard to avoid VCS directories in a source tree. @@ -418,38 +440,33 @@ includedirectivep iopts = do -- things in dot dirs. An --old-glob command line flag disables this workaround, for backward compatibility. oldglobflag <- liftIO $ getFlag ["old-glob"] let - files2 = (if isglob && not oldglobflag then filter (not.hasdotdir) else id) files + files3 = (if isglob && not oldglobflag then filter (not.hasdotdir) else id) files2 where hasdotdir p = any isdotdir $ splitPath p where isdotdir c = "." `isPrefixOf` c && "/" `isSuffixOf` c -- Throw an error if no files were matched. - when (null files2) $ - customFailure $ parseErrorAt off $ "No files were matched by glob pattern: " ++ globpattern + when (null files3) $ customFailure $ parseErrorAt off $ "No files were matched by: " ++ globpattern - -- If a glob was used, exclude the current file, for convenience. + -- If the current file got included, ignore it. + -- This is done last to avoid triggering the error above. let - files3 = - dbg6 (parentf <> " include: matched files" <> if isglob then " (excluding current file)" else "") $ - (if isglob then filter (/= parentf) else id) files2 + files4 = + dbg6 (parentf <> " include: matched files (excluding current file)") $ + filter (not.(`elem` cparentf)) files3 - return files3 + return files4 -- Parse the given included file (and any deeper includes, recursively) as if it was inlined in the current (parent) file. -- The offset of the start of the include directive in the parent file is provided for error messages. - parseIncludedFile :: MonadIO m => InputOpts -> Int -> PrefixedFilePath -> ErroringJournalParser m () - parseIncludedFile iopts1 eoff prefixedpath = do + parseIncludedFile :: MonadIO m => InputOpts -> PrefixedFilePath -> ErroringJournalParser m () + parseIncludedFile iopts1 prefixedpath = do let (_mprefix,filepath) = splitReaderPrefix prefixedpath - -- Throw an error if a cycle is detected - parentj <- get - let parentfilestack = jincludefilestack parentj - when (dbg7 "parseIncludedFile: reading" filepath `elem` parentfilestack) $ - customFailure $ parseErrorAt eoff $ "This included file forms a cycle: " ++ filepath - -- Read the file's content, or throw an error childInput <- lift $ readFilePortably filepath `orRethrowIOError` "failed to read a file" + parentj <- get let initChildj = newJournalWithParseStateFrom filepath parentj -- Choose a reader based on the file path prefix or file extension, @@ -496,13 +513,11 @@ includedirectivep iopts = do ,jincludefilestack = filepath : jincludefilestack j } --- Get the canonical path of the file referenced by this parse position. --- Symbolic links will be dereferenced. This probably will always succeed --- (since the parse file's path is probably always absolute). +-- Get the absolute path of the file referenced by this parse position. +-- (Symbolic links will not be dereferenced.) +-- This probably will always succeed, since the parse file's path is probably always absolute. sourcePosFilePath :: (MonadIO m) => SourcePos -> m FilePath -sourcePosFilePath = liftIO . canonicalizePath . sourceName --- "canonicalizePath is a very big hammer. If you only need an absolute path, makeAbsolute is sufficient" --- but we only do this once per include directive, seems ok to leave it as is. +sourcePosFilePath = liftIO . makeAbsolute . sourceName -- | Lift an IO action into the exception monad, rethrowing any IO -- error with the given message prepended. @@ -1263,8 +1278,8 @@ tests_JournalReader = testGroup "JournalReader" [ assertParse ignoredpricecommoditydirectivep "N $\n" ,testGroup "includedirectivep" [ - testCase "include" $ assertParseErrorE (includedirectivep definputopts) "include nosuchfile\n" "No files were matched by glob pattern: nosuchfile" - ,testCase "glob" $ assertParseErrorE (includedirectivep definputopts) "include nosuchfile*\n" "No files were matched by glob pattern: nosuchfile*" + testCase "include" $ assertParseErrorE (includedirectivep definputopts) "include nosuchfile\n" "No files were matched by: nosuchfile" + ,testCase "glob" $ assertParseErrorE (includedirectivep definputopts) "include nosuchfile*\n" "No files were matched by: nosuchfile*" ] ,testCase "marketpricedirectivep" $ assertParseEq marketpricedirectivep diff --git a/hledger/test/journal/include/include.test b/hledger/test/journal/include/include.test index 21f30ed52..252f6614b 100644 --- a/hledger/test/journal/include/include.test +++ b/hledger/test/journal/include/include.test @@ -35,7 +35,7 @@ include ~/a.j $ HOME="$PWD" hledger -f - print 2025-01-01 a ->= 0 +>= # ** 4. include followed by whitespace and/or comment -> missing argument error < @@ -79,10 +79,9 @@ $ hledger -f- files >2 /Invalid glob/ >=1 -# ** 10. Including the current file literally -> cycle error. -$ hledger -f self.j files ->2 /included file forms a cycle/ ->=1 +# ** 10. Including the current file literally -> harmless, current file is ignored. +$ hledger -f self.j files | sed -E 's|.*/||' +self.j # ** 11. Including a multi-file cycle, all literally -> cycle error. $ hledger -f cycle/cycle.j files @@ -90,7 +89,7 @@ $ hledger -f cycle/cycle.j files >=1 # ** 12. Including the current file via glob -> harmless, globs ignore current file. -$ hledger -f glob-self.j files | sed -E 's|.*hledger/test/journal/include/||' +$ hledger -f glob-self.j files | sed -E 's|.*/||' glob-self.j # ** 13. Including a cycle, involving globs -> cycle error @@ -101,6 +100,7 @@ $ hledger -f cycle/globcycle.j files # ** 14. Old-style deep glob **/*.j -> all non-dot .j files in current dir or non-dot subdirs. < include glob-deep/**/*.j + $ hledger -f - files | sed -E 's|.*/glob-deep/|glob-deep/|' - glob-deep/a.j @@ -112,8 +112,6 @@ glob-deep/b/bb/bb.j glob-deep/c/c.j # ** 15. Simpler deep glob **.j -> same as above. -< -include glob-deep/**/*.j $ hledger -f - files | sed -E 's|.*/glob-deep/|glob-deep/|' - glob-deep/a.j @@ -124,25 +122,18 @@ glob-deep/b/b.j glob-deep/b/bb/bb.j glob-deep/c/c.j -# ** 16. --old-glob preserves pre-1.50 glob behaviour: avoiding dot things at top (.c/) and bottom (b/.d.j), -# but searching intermediate dot dirs (b/.e/). +# ** 16. --old-glob preserves pre-1.50 glob behaviour: avoiding dot things at top (.c/) +# and at bottom (b/.d.j), searching intermediate dot dirs (b/.e/). < include glob-dot/**.j + $ hledger -f - files --old-glob | sed -E 's|.*/glob-dot/||' - a.j b/.e/e.j b/b.j -# ** 17. Otherwise, globs avoid all paths involving dot dirs (.c/, b/.d.j, b/.e/) (a workaround). -< -include glob-dot/**.j -$ hledger -f - files | sed -E 's|.*/glob-dot/||' -- -a.j -b/b.j - -# ** 18. A non-glob path can match dot things. +# ** 17. Dot things can be matched by an explicit non-glob path. < include glob-dot/.c/c.j include glob-dot/b/.d.j @@ -153,27 +144,36 @@ $ hledger -f - files | sed -E 's|.*/glob-dot/||' b/.d.j b/.e/e.j -# ** 19. A glob follows a symlink to a regular file (and shows the target file's path). +# ** 18. They can't be matched by a glob path, even if explicitly mentioned, by default. +< +include glob-dot/.c/c.j* +$ hledger -f - files | sed -E 's|.*/glob-dot/||' +>2 /No files were matched/ +#>=1 # sed hides the exit code + +# ** 19. They can be matched by an explicit glob path if --old-glob is used. +< +include glob-dot/.c/c.j* +$ hledger -f - files --old-glob | sed -E 's|.*/glob-dot/||' +- +.c/c.j + +# ** 20. A glob follows a symlink to a regular file. (And shows the symlink's path.) $ hledger -f glob-symlinked-file/a.j files | sed -E 's|.*/glob-symlinked-file/||' a.j -c.j +b.j -# ** 20. A glob follows a symlink to a regular dir. +# ** 21. A glob follows a symlink to a regular dir. $ hledger -f glob-symlinked-dir/a.j files | sed -E 's|.*/glob-symlinked-dir/||' a.j -c/c.j +b/c.j -# ** 21. A glob follows a symlink to a dot file. +# ** 22. A glob follows a symlink to a dot file. $ hledger -f glob-symlinked-dotfile/a.j files | sed -E 's|.*/glob-symlinked-dotfile/||' a.j -.c.j +b.j -# ** 22. A glob does not follow a symlink to a dot dir. +# ** 23. A glob follows a symlink to a dot dir. $ hledger -f glob-symlinked-dotdir/a.j files | sed -E 's|.*/glob-symlinked-dotdir/||' ->2 /No files were matched by glob pattern/ -#>=1 # sed hides this - -# ** 23. A glob does follow a symlink to a dot dir if --old-glob is used. -$ hledger -f glob-symlinked-dotdir/a.j files --old-glob | sed -E 's|.*/glob-symlinked-dotdir/||' a.j -.c/c.j +b/c.j