fix:journal:include: fix some regressions with glob matching in 1.50-1.50.3

Before 1.50, journal's include directive's handling of glob patterns (*, **, ?, etc.)
had these limitations:

- ** always searched intermediate dot directories
- ** matched only directories, not files

In 1.50-1.50.3, it had different limitations, some unintended:

- it ignored all dot files, dot dirs, and symbolic links to dot dirs,
  even when explicitly mentioned in the pattern (unless using --old-glob)
- it showed symbolic links dereferenced, eg in `hledger files` output

Now it has fewer limitations, mainly this:

- it ignores all dot files and dot dirs, even when explicitly mentioned (unless using --old-glob)

Ie it no longer ignores symbolic links to dot dirs, and it no longer shows symbolic links dereferenced.
Also: including the current file is now always harmless, whether using a glob pattern or not.

Internally, file paths in the "include file stack" (jincludefilestack) are now just absolute,
but not canonicalised; showing symbolic links un-dereferenced in output and error messages seems
generally more useful. This might affect output elsewhere also.
(Those paths are still canonicalised on the fly when checking for include cycles,
not so efficiently: each time an include directive is parsed, all the current parent files
and all the new glob-matched include files will be re-canonicalised.
Hopefully this is unnoticeable.)
This commit is contained in:
Simon Michael 2025-12-01 08:43:08 -08:00
parent 00f6a832d4
commit 81744d81a1
2 changed files with 76 additions and 61 deletions

View File

@ -32,6 +32,7 @@ Hledger.Read.Common, to avoid import cycles.
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE PackageImports #-} {-# LANGUAGE PackageImports #-}
{-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE MultiWayIf #-}
--- ** exports --- ** exports
module Hledger.Read.JournalReader ( module Hledger.Read.JournalReader (
@ -72,13 +73,14 @@ where
--- ** imports --- ** imports
import Control.Exception qualified as C import Control.Exception qualified as C
import Control.Monad (forM_, when, void, unless, filterM) import Control.Monad (forM_, when, void, unless, filterM, forM)
import Control.Monad.IO.Class (MonadIO, liftIO) import Control.Monad.IO.Class (MonadIO, liftIO)
import Control.Monad.Except (ExceptT(..), runExceptT) import Control.Monad.Except (ExceptT(..), runExceptT)
import Control.Monad.State.Strict (evalStateT,get,modify',put) import Control.Monad.State.Strict (evalStateT,get,modify',put)
import Control.Monad.Trans.Class (lift) import Control.Monad.Trans.Class (lift)
import Data.Char (toLower) import Data.Char (toLower)
import Data.Either (isRight, lefts) import Data.Either (isRight, lefts)
import Data.Functor ((<&>))
import Data.Map.Strict qualified as M import Data.Map.Strict qualified as M
import Data.Text (Text) import Data.Text (Text)
import Data.String import Data.String
@ -91,6 +93,7 @@ import Safe
import Text.Megaparsec hiding (parse) import Text.Megaparsec hiding (parse)
import Text.Megaparsec.Char import Text.Megaparsec.Char
import Text.Printf import Text.Printf
import System.Directory (canonicalizePath, doesFileExist, makeAbsolute)
import System.FilePath import System.FilePath
import "Glob" System.FilePath.Glob hiding (match) import "Glob" System.FilePath.Glob hiding (match)
-- import "filepattern" System.FilePattern.Directory -- import "filepattern" System.FilePattern.Directory
@ -103,8 +106,6 @@ import Hledger.Read.CsvReader qualified as CsvReader (reader)
import Hledger.Read.RulesReader qualified as RulesReader (reader) import Hledger.Read.RulesReader qualified as RulesReader (reader)
import Hledger.Read.TimeclockReader qualified as TimeclockReader (reader) import Hledger.Read.TimeclockReader qualified as TimeclockReader (reader)
import Hledger.Read.TimedotReader qualified as TimedotReader (reader) import Hledger.Read.TimedotReader qualified as TimedotReader (reader)
import System.Directory (canonicalizePath, doesFileExist)
import Data.Functor ((<&>))
--- ** doctest setup --- ** doctest setup
-- $setup -- $setup
@ -341,7 +342,7 @@ includedirectivep iopts = do
Just fmt -> map ((show fmt++":")++) paths Just fmt -> map ((show fmt++":")++) paths
-- Parse each one, as if inlined here. -- Parse each one, as if inlined here.
forM_ prefixedpaths $ parseIncludedFile iopts eoff forM_ prefixedpaths $ parseIncludedFile iopts
where where
@ -407,9 +408,30 @@ includedirectivep iopts = do
-- Exclude any directories or symlinks to directories, and canonicalise, and sort. -- Exclude any directories or symlinks to directories, and canonicalise, and sort.
files <- liftIO $ files <- liftIO $
filterM doesFileExist paths filterM doesFileExist paths
>>= mapM canonicalizePath >>= mapM makeAbsolute
<&> sort <&> sort
-- -- If a glob was used, exclude the current file, for convenience.
-- let
-- files3 =
-- dbg6 (parentf <> " include: matched files" <> if isglob then " (excluding current file)" else "") $
-- (if isglob then filter (/= parentf) else id) files
-- Throw an error if one of these files is among the grandparent files, forming a cycle.
-- Though, ignore the immediate parent file for convenience. XXX inconsistent - should it ignore all cyclic includes ?
-- We used to store the canonical paths, then switched to non-canonical paths for more useful output,
-- which means for each include directive we must re-canonicalise everything here; noticeable ? XXX
parentj <- get
let parentfiles = jincludefilestack parentj
cparentfiles <- liftIO $ mapM canonicalizePath parentfiles
let cparentf = take 1 parentfiles
files2 <- forM files $ \f -> do
cf <- liftIO $ canonicalizePath f
if
| [cf] == cparentf -> return cf -- current file - return canonicalised, will be excluded later
| cf `elem` drop 1 cparentfiles -> customFailure $ parseErrorAt off $ "This included file forms a cycle: " ++ f
| otherwise -> return f
-- Work around a Glob bug with dot dirs: while **/ ignores dot dirs in the starting and ending dirs, -- Work around a Glob bug with dot dirs: while **/ ignores dot dirs in the starting and ending dirs,
-- it does search dot dirs in between those two (Glob #49). -- it does search dot dirs in between those two (Glob #49).
-- This could be inconvenient, eg making it hard to avoid VCS directories in a source tree. -- This could be inconvenient, eg making it hard to avoid VCS directories in a source tree.
@ -418,38 +440,33 @@ includedirectivep iopts = do
-- things in dot dirs. An --old-glob command line flag disables this workaround, for backward compatibility. -- things in dot dirs. An --old-glob command line flag disables this workaround, for backward compatibility.
oldglobflag <- liftIO $ getFlag ["old-glob"] oldglobflag <- liftIO $ getFlag ["old-glob"]
let let
files2 = (if isglob && not oldglobflag then filter (not.hasdotdir) else id) files files3 = (if isglob && not oldglobflag then filter (not.hasdotdir) else id) files2
where where
hasdotdir p = any isdotdir $ splitPath p hasdotdir p = any isdotdir $ splitPath p
where where
isdotdir c = "." `isPrefixOf` c && "/" `isSuffixOf` c isdotdir c = "." `isPrefixOf` c && "/" `isSuffixOf` c
-- Throw an error if no files were matched. -- Throw an error if no files were matched.
when (null files2) $ when (null files3) $ customFailure $ parseErrorAt off $ "No files were matched by: " ++ globpattern
customFailure $ parseErrorAt off $ "No files were matched by glob pattern: " ++ globpattern
-- If a glob was used, exclude the current file, for convenience. -- If the current file got included, ignore it.
-- This is done last to avoid triggering the error above.
let let
files3 = files4 =
dbg6 (parentf <> " include: matched files" <> if isglob then " (excluding current file)" else "") $ dbg6 (parentf <> " include: matched files (excluding current file)") $
(if isglob then filter (/= parentf) else id) files2 filter (not.(`elem` cparentf)) files3
return files3 return files4
-- Parse the given included file (and any deeper includes, recursively) as if it was inlined in the current (parent) file. -- Parse the given included file (and any deeper includes, recursively) as if it was inlined in the current (parent) file.
-- The offset of the start of the include directive in the parent file is provided for error messages. -- The offset of the start of the include directive in the parent file is provided for error messages.
parseIncludedFile :: MonadIO m => InputOpts -> Int -> PrefixedFilePath -> ErroringJournalParser m () parseIncludedFile :: MonadIO m => InputOpts -> PrefixedFilePath -> ErroringJournalParser m ()
parseIncludedFile iopts1 eoff prefixedpath = do parseIncludedFile iopts1 prefixedpath = do
let (_mprefix,filepath) = splitReaderPrefix prefixedpath let (_mprefix,filepath) = splitReaderPrefix prefixedpath
-- Throw an error if a cycle is detected
parentj <- get
let parentfilestack = jincludefilestack parentj
when (dbg7 "parseIncludedFile: reading" filepath `elem` parentfilestack) $
customFailure $ parseErrorAt eoff $ "This included file forms a cycle: " ++ filepath
-- Read the file's content, or throw an error -- Read the file's content, or throw an error
childInput <- lift $ readFilePortably filepath `orRethrowIOError` "failed to read a file" childInput <- lift $ readFilePortably filepath `orRethrowIOError` "failed to read a file"
parentj <- get
let initChildj = newJournalWithParseStateFrom filepath parentj let initChildj = newJournalWithParseStateFrom filepath parentj
-- Choose a reader based on the file path prefix or file extension, -- Choose a reader based on the file path prefix or file extension,
@ -496,13 +513,11 @@ includedirectivep iopts = do
,jincludefilestack = filepath : jincludefilestack j ,jincludefilestack = filepath : jincludefilestack j
} }
-- Get the canonical path of the file referenced by this parse position. -- Get the absolute path of the file referenced by this parse position.
-- Symbolic links will be dereferenced. This probably will always succeed -- (Symbolic links will not be dereferenced.)
-- (since the parse file's path is probably always absolute). -- This probably will always succeed, since the parse file's path is probably always absolute.
sourcePosFilePath :: (MonadIO m) => SourcePos -> m FilePath sourcePosFilePath :: (MonadIO m) => SourcePos -> m FilePath
sourcePosFilePath = liftIO . canonicalizePath . sourceName sourcePosFilePath = liftIO . makeAbsolute . sourceName
-- "canonicalizePath is a very big hammer. If you only need an absolute path, makeAbsolute is sufficient"
-- but we only do this once per include directive, seems ok to leave it as is.
-- | Lift an IO action into the exception monad, rethrowing any IO -- | Lift an IO action into the exception monad, rethrowing any IO
-- error with the given message prepended. -- error with the given message prepended.
@ -1263,8 +1278,8 @@ tests_JournalReader = testGroup "JournalReader" [
assertParse ignoredpricecommoditydirectivep "N $\n" assertParse ignoredpricecommoditydirectivep "N $\n"
,testGroup "includedirectivep" [ ,testGroup "includedirectivep" [
testCase "include" $ assertParseErrorE (includedirectivep definputopts) "include nosuchfile\n" "No files were matched by glob pattern: nosuchfile" testCase "include" $ assertParseErrorE (includedirectivep definputopts) "include nosuchfile\n" "No files were matched by: nosuchfile"
,testCase "glob" $ assertParseErrorE (includedirectivep definputopts) "include nosuchfile*\n" "No files were matched by glob pattern: nosuchfile*" ,testCase "glob" $ assertParseErrorE (includedirectivep definputopts) "include nosuchfile*\n" "No files were matched by: nosuchfile*"
] ]
,testCase "marketpricedirectivep" $ assertParseEq marketpricedirectivep ,testCase "marketpricedirectivep" $ assertParseEq marketpricedirectivep

View File

@ -35,7 +35,7 @@ include ~/a.j
$ HOME="$PWD" hledger -f - print $ HOME="$PWD" hledger -f - print
2025-01-01 a 2025-01-01 a
>= 0 >=
# ** 4. include followed by whitespace and/or comment -> missing argument error # ** 4. include followed by whitespace and/or comment -> missing argument error
< <
@ -79,10 +79,9 @@ $ hledger -f- files
>2 /Invalid glob/ >2 /Invalid glob/
>=1 >=1
# ** 10. Including the current file literally -> cycle error. # ** 10. Including the current file literally -> harmless, current file is ignored.
$ hledger -f self.j files $ hledger -f self.j files | sed -E 's|.*/||'
>2 /included file forms a cycle/ self.j
>=1
# ** 11. Including a multi-file cycle, all literally -> cycle error. # ** 11. Including a multi-file cycle, all literally -> cycle error.
$ hledger -f cycle/cycle.j files $ hledger -f cycle/cycle.j files
@ -90,7 +89,7 @@ $ hledger -f cycle/cycle.j files
>=1 >=1
# ** 12. Including the current file via glob -> harmless, globs ignore current file. # ** 12. Including the current file via glob -> harmless, globs ignore current file.
$ hledger -f glob-self.j files | sed -E 's|.*hledger/test/journal/include/||' $ hledger -f glob-self.j files | sed -E 's|.*/||'
glob-self.j glob-self.j
# ** 13. Including a cycle, involving globs -> cycle error # ** 13. Including a cycle, involving globs -> cycle error
@ -101,6 +100,7 @@ $ hledger -f cycle/globcycle.j files
# ** 14. Old-style deep glob **/*.j -> all non-dot .j files in current dir or non-dot subdirs. # ** 14. Old-style deep glob **/*.j -> all non-dot .j files in current dir or non-dot subdirs.
< <
include glob-deep/**/*.j include glob-deep/**/*.j
$ hledger -f - files | sed -E 's|.*/glob-deep/|glob-deep/|' $ hledger -f - files | sed -E 's|.*/glob-deep/|glob-deep/|'
- -
glob-deep/a.j glob-deep/a.j
@ -112,8 +112,6 @@ glob-deep/b/bb/bb.j
glob-deep/c/c.j glob-deep/c/c.j
# ** 15. Simpler deep glob **.j -> same as above. # ** 15. Simpler deep glob **.j -> same as above.
<
include glob-deep/**/*.j
$ hledger -f - files | sed -E 's|.*/glob-deep/|glob-deep/|' $ hledger -f - files | sed -E 's|.*/glob-deep/|glob-deep/|'
- -
glob-deep/a.j glob-deep/a.j
@ -124,25 +122,18 @@ glob-deep/b/b.j
glob-deep/b/bb/bb.j glob-deep/b/bb/bb.j
glob-deep/c/c.j glob-deep/c/c.j
# ** 16. --old-glob preserves pre-1.50 glob behaviour: avoiding dot things at top (.c/) and bottom (b/.d.j), # ** 16. --old-glob preserves pre-1.50 glob behaviour: avoiding dot things at top (.c/)
# but searching intermediate dot dirs (b/.e/). # and at bottom (b/.d.j), searching intermediate dot dirs (b/.e/).
< <
include glob-dot/**.j include glob-dot/**.j
$ hledger -f - files --old-glob | sed -E 's|.*/glob-dot/||' $ hledger -f - files --old-glob | sed -E 's|.*/glob-dot/||'
- -
a.j a.j
b/.e/e.j b/.e/e.j
b/b.j b/b.j
# ** 17. Otherwise, globs avoid all paths involving dot dirs (.c/, b/.d.j, b/.e/) (a workaround). # ** 17. Dot things can be matched by an explicit non-glob path.
<
include glob-dot/**.j
$ hledger -f - files | sed -E 's|.*/glob-dot/||'
-
a.j
b/b.j
# ** 18. A non-glob path can match dot things.
< <
include glob-dot/.c/c.j include glob-dot/.c/c.j
include glob-dot/b/.d.j include glob-dot/b/.d.j
@ -153,27 +144,36 @@ $ hledger -f - files | sed -E 's|.*/glob-dot/||'
b/.d.j b/.d.j
b/.e/e.j b/.e/e.j
# ** 19. A glob follows a symlink to a regular file (and shows the target file's path). # ** 18. They can't be matched by a glob path, even if explicitly mentioned, by default.
<
include glob-dot/.c/c.j*
$ hledger -f - files | sed -E 's|.*/glob-dot/||'
>2 /No files were matched/
#>=1 # sed hides the exit code
# ** 19. They can be matched by an explicit glob path if --old-glob is used.
<
include glob-dot/.c/c.j*
$ hledger -f - files --old-glob | sed -E 's|.*/glob-dot/||'
-
.c/c.j
# ** 20. A glob follows a symlink to a regular file. (And shows the symlink's path.)
$ hledger -f glob-symlinked-file/a.j files | sed -E 's|.*/glob-symlinked-file/||' $ hledger -f glob-symlinked-file/a.j files | sed -E 's|.*/glob-symlinked-file/||'
a.j a.j
c.j b.j
# ** 20. A glob follows a symlink to a regular dir. # ** 21. A glob follows a symlink to a regular dir.
$ hledger -f glob-symlinked-dir/a.j files | sed -E 's|.*/glob-symlinked-dir/||' $ hledger -f glob-symlinked-dir/a.j files | sed -E 's|.*/glob-symlinked-dir/||'
a.j a.j
c/c.j b/c.j
# ** 21. A glob follows a symlink to a dot file. # ** 22. A glob follows a symlink to a dot file.
$ hledger -f glob-symlinked-dotfile/a.j files | sed -E 's|.*/glob-symlinked-dotfile/||' $ hledger -f glob-symlinked-dotfile/a.j files | sed -E 's|.*/glob-symlinked-dotfile/||'
a.j a.j
.c.j b.j
# ** 22. A glob does not follow a symlink to a dot dir. # ** 23. A glob follows a symlink to a dot dir.
$ hledger -f glob-symlinked-dotdir/a.j files | sed -E 's|.*/glob-symlinked-dotdir/||' $ hledger -f glob-symlinked-dotdir/a.j files | sed -E 's|.*/glob-symlinked-dotdir/||'
>2 /No files were matched by glob pattern/
#>=1 # sed hides this
# ** 23. A glob does follow a symlink to a dot dir if --old-glob is used.
$ hledger -f glob-symlinked-dotdir/a.j files --old-glob | sed -E 's|.*/glob-symlinked-dotdir/||'
a.j a.j
.c/c.j b/c.j