fix:journal:include: fix some regressions with glob matching in 1.50-1.50.3

Before 1.50, journal's include directive's handling of glob patterns (*, **, ?, etc.)
had these limitations:

- ** always searched intermediate dot directories
- ** matched only directories, not files

In 1.50-1.50.3, it had different limitations, some unintended:

- it ignored all dot files, dot dirs, and symbolic links to dot dirs,
  even when explicitly mentioned in the pattern (unless using --old-glob)
- it showed symbolic links dereferenced, eg in `hledger files` output

Now it has fewer limitations, mainly this:

- it ignores all dot files and dot dirs, even when explicitly mentioned (unless using --old-glob)

Ie it no longer ignores symbolic links to dot dirs, and it no longer shows symbolic links dereferenced.
Also: including the current file is now always harmless, whether using a glob pattern or not.

Internally, file paths in the "include file stack" (jincludefilestack) are now just absolute,
but not canonicalised; showing symbolic links un-dereferenced in output and error messages seems
generally more useful. This might affect output elsewhere also.
(Those paths are still canonicalised on the fly when checking for include cycles,
not so efficiently: each time an include directive is parsed, all the current parent files
and all the new glob-matched include files will be re-canonicalised.
Hopefully this is unnoticeable.)
This commit is contained in:
Simon Michael 2025-12-01 08:43:08 -08:00
parent 00f6a832d4
commit 81744d81a1
2 changed files with 76 additions and 61 deletions

View File

@ -32,6 +32,7 @@ Hledger.Read.Common, to avoid import cycles.
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE PackageImports #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE MultiWayIf #-}
--- ** exports
module Hledger.Read.JournalReader (
@ -72,13 +73,14 @@ where
--- ** imports
import Control.Exception qualified as C
import Control.Monad (forM_, when, void, unless, filterM)
import Control.Monad (forM_, when, void, unless, filterM, forM)
import Control.Monad.IO.Class (MonadIO, liftIO)
import Control.Monad.Except (ExceptT(..), runExceptT)
import Control.Monad.State.Strict (evalStateT,get,modify',put)
import Control.Monad.Trans.Class (lift)
import Data.Char (toLower)
import Data.Either (isRight, lefts)
import Data.Functor ((<&>))
import Data.Map.Strict qualified as M
import Data.Text (Text)
import Data.String
@ -91,6 +93,7 @@ import Safe
import Text.Megaparsec hiding (parse)
import Text.Megaparsec.Char
import Text.Printf
import System.Directory (canonicalizePath, doesFileExist, makeAbsolute)
import System.FilePath
import "Glob" System.FilePath.Glob hiding (match)
-- import "filepattern" System.FilePattern.Directory
@ -103,8 +106,6 @@ import Hledger.Read.CsvReader qualified as CsvReader (reader)
import Hledger.Read.RulesReader qualified as RulesReader (reader)
import Hledger.Read.TimeclockReader qualified as TimeclockReader (reader)
import Hledger.Read.TimedotReader qualified as TimedotReader (reader)
import System.Directory (canonicalizePath, doesFileExist)
import Data.Functor ((<&>))
--- ** doctest setup
-- $setup
@ -341,7 +342,7 @@ includedirectivep iopts = do
Just fmt -> map ((show fmt++":")++) paths
-- Parse each one, as if inlined here.
forM_ prefixedpaths $ parseIncludedFile iopts eoff
forM_ prefixedpaths $ parseIncludedFile iopts
where
@ -407,9 +408,30 @@ includedirectivep iopts = do
-- Exclude any directories or symlinks to directories, and canonicalise, and sort.
files <- liftIO $
filterM doesFileExist paths
>>= mapM canonicalizePath
>>= mapM makeAbsolute
<&> sort
-- -- If a glob was used, exclude the current file, for convenience.
-- let
-- files3 =
-- dbg6 (parentf <> " include: matched files" <> if isglob then " (excluding current file)" else "") $
-- (if isglob then filter (/= parentf) else id) files
-- Throw an error if one of these files is among the grandparent files, forming a cycle.
-- Though, ignore the immediate parent file for convenience. XXX inconsistent - should it ignore all cyclic includes ?
-- We used to store the canonical paths, then switched to non-canonical paths for more useful output,
-- which means for each include directive we must re-canonicalise everything here; noticeable ? XXX
parentj <- get
let parentfiles = jincludefilestack parentj
cparentfiles <- liftIO $ mapM canonicalizePath parentfiles
let cparentf = take 1 parentfiles
files2 <- forM files $ \f -> do
cf <- liftIO $ canonicalizePath f
if
| [cf] == cparentf -> return cf -- current file - return canonicalised, will be excluded later
| cf `elem` drop 1 cparentfiles -> customFailure $ parseErrorAt off $ "This included file forms a cycle: " ++ f
| otherwise -> return f
-- Work around a Glob bug with dot dirs: while **/ ignores dot dirs in the starting and ending dirs,
-- it does search dot dirs in between those two (Glob #49).
-- This could be inconvenient, eg making it hard to avoid VCS directories in a source tree.
@ -418,38 +440,33 @@ includedirectivep iopts = do
-- things in dot dirs. An --old-glob command line flag disables this workaround, for backward compatibility.
oldglobflag <- liftIO $ getFlag ["old-glob"]
let
files2 = (if isglob && not oldglobflag then filter (not.hasdotdir) else id) files
files3 = (if isglob && not oldglobflag then filter (not.hasdotdir) else id) files2
where
hasdotdir p = any isdotdir $ splitPath p
where
isdotdir c = "." `isPrefixOf` c && "/" `isSuffixOf` c
-- Throw an error if no files were matched.
when (null files2) $
customFailure $ parseErrorAt off $ "No files were matched by glob pattern: " ++ globpattern
when (null files3) $ customFailure $ parseErrorAt off $ "No files were matched by: " ++ globpattern
-- If a glob was used, exclude the current file, for convenience.
-- If the current file got included, ignore it.
-- This is done last to avoid triggering the error above.
let
files3 =
dbg6 (parentf <> " include: matched files" <> if isglob then " (excluding current file)" else "") $
(if isglob then filter (/= parentf) else id) files2
files4 =
dbg6 (parentf <> " include: matched files (excluding current file)") $
filter (not.(`elem` cparentf)) files3
return files3
return files4
-- Parse the given included file (and any deeper includes, recursively) as if it was inlined in the current (parent) file.
-- The offset of the start of the include directive in the parent file is provided for error messages.
parseIncludedFile :: MonadIO m => InputOpts -> Int -> PrefixedFilePath -> ErroringJournalParser m ()
parseIncludedFile iopts1 eoff prefixedpath = do
parseIncludedFile :: MonadIO m => InputOpts -> PrefixedFilePath -> ErroringJournalParser m ()
parseIncludedFile iopts1 prefixedpath = do
let (_mprefix,filepath) = splitReaderPrefix prefixedpath
-- Throw an error if a cycle is detected
parentj <- get
let parentfilestack = jincludefilestack parentj
when (dbg7 "parseIncludedFile: reading" filepath `elem` parentfilestack) $
customFailure $ parseErrorAt eoff $ "This included file forms a cycle: " ++ filepath
-- Read the file's content, or throw an error
childInput <- lift $ readFilePortably filepath `orRethrowIOError` "failed to read a file"
parentj <- get
let initChildj = newJournalWithParseStateFrom filepath parentj
-- Choose a reader based on the file path prefix or file extension,
@ -496,13 +513,11 @@ includedirectivep iopts = do
,jincludefilestack = filepath : jincludefilestack j
}
-- Get the canonical path of the file referenced by this parse position.
-- Symbolic links will be dereferenced. This probably will always succeed
-- (since the parse file's path is probably always absolute).
-- Get the absolute path of the file referenced by this parse position.
-- (Symbolic links will not be dereferenced.)
-- This probably will always succeed, since the parse file's path is probably always absolute.
sourcePosFilePath :: (MonadIO m) => SourcePos -> m FilePath
sourcePosFilePath = liftIO . canonicalizePath . sourceName
-- "canonicalizePath is a very big hammer. If you only need an absolute path, makeAbsolute is sufficient"
-- but we only do this once per include directive, seems ok to leave it as is.
sourcePosFilePath = liftIO . makeAbsolute . sourceName
-- | Lift an IO action into the exception monad, rethrowing any IO
-- error with the given message prepended.
@ -1263,8 +1278,8 @@ tests_JournalReader = testGroup "JournalReader" [
assertParse ignoredpricecommoditydirectivep "N $\n"
,testGroup "includedirectivep" [
testCase "include" $ assertParseErrorE (includedirectivep definputopts) "include nosuchfile\n" "No files were matched by glob pattern: nosuchfile"
,testCase "glob" $ assertParseErrorE (includedirectivep definputopts) "include nosuchfile*\n" "No files were matched by glob pattern: nosuchfile*"
testCase "include" $ assertParseErrorE (includedirectivep definputopts) "include nosuchfile\n" "No files were matched by: nosuchfile"
,testCase "glob" $ assertParseErrorE (includedirectivep definputopts) "include nosuchfile*\n" "No files were matched by: nosuchfile*"
]
,testCase "marketpricedirectivep" $ assertParseEq marketpricedirectivep

View File

@ -35,7 +35,7 @@ include ~/a.j
$ HOME="$PWD" hledger -f - print
2025-01-01 a
>= 0
>=
# ** 4. include followed by whitespace and/or comment -> missing argument error
<
@ -79,10 +79,9 @@ $ hledger -f- files
>2 /Invalid glob/
>=1
# ** 10. Including the current file literally -> cycle error.
$ hledger -f self.j files
>2 /included file forms a cycle/
>=1
# ** 10. Including the current file literally -> harmless, current file is ignored.
$ hledger -f self.j files | sed -E 's|.*/||'
self.j
# ** 11. Including a multi-file cycle, all literally -> cycle error.
$ hledger -f cycle/cycle.j files
@ -90,7 +89,7 @@ $ hledger -f cycle/cycle.j files
>=1
# ** 12. Including the current file via glob -> harmless, globs ignore current file.
$ hledger -f glob-self.j files | sed -E 's|.*hledger/test/journal/include/||'
$ hledger -f glob-self.j files | sed -E 's|.*/||'
glob-self.j
# ** 13. Including a cycle, involving globs -> cycle error
@ -101,6 +100,7 @@ $ hledger -f cycle/globcycle.j files
# ** 14. Old-style deep glob **/*.j -> all non-dot .j files in current dir or non-dot subdirs.
<
include glob-deep/**/*.j
$ hledger -f - files | sed -E 's|.*/glob-deep/|glob-deep/|'
-
glob-deep/a.j
@ -112,8 +112,6 @@ glob-deep/b/bb/bb.j
glob-deep/c/c.j
# ** 15. Simpler deep glob **.j -> same as above.
<
include glob-deep/**/*.j
$ hledger -f - files | sed -E 's|.*/glob-deep/|glob-deep/|'
-
glob-deep/a.j
@ -124,25 +122,18 @@ glob-deep/b/b.j
glob-deep/b/bb/bb.j
glob-deep/c/c.j
# ** 16. --old-glob preserves pre-1.50 glob behaviour: avoiding dot things at top (.c/) and bottom (b/.d.j),
# but searching intermediate dot dirs (b/.e/).
# ** 16. --old-glob preserves pre-1.50 glob behaviour: avoiding dot things at top (.c/)
# and at bottom (b/.d.j), searching intermediate dot dirs (b/.e/).
<
include glob-dot/**.j
$ hledger -f - files --old-glob | sed -E 's|.*/glob-dot/||'
-
a.j
b/.e/e.j
b/b.j
# ** 17. Otherwise, globs avoid all paths involving dot dirs (.c/, b/.d.j, b/.e/) (a workaround).
<
include glob-dot/**.j
$ hledger -f - files | sed -E 's|.*/glob-dot/||'
-
a.j
b/b.j
# ** 18. A non-glob path can match dot things.
# ** 17. Dot things can be matched by an explicit non-glob path.
<
include glob-dot/.c/c.j
include glob-dot/b/.d.j
@ -153,27 +144,36 @@ $ hledger -f - files | sed -E 's|.*/glob-dot/||'
b/.d.j
b/.e/e.j
# ** 19. A glob follows a symlink to a regular file (and shows the target file's path).
# ** 18. They can't be matched by a glob path, even if explicitly mentioned, by default.
<
include glob-dot/.c/c.j*
$ hledger -f - files | sed -E 's|.*/glob-dot/||'
>2 /No files were matched/
#>=1 # sed hides the exit code
# ** 19. They can be matched by an explicit glob path if --old-glob is used.
<
include glob-dot/.c/c.j*
$ hledger -f - files --old-glob | sed -E 's|.*/glob-dot/||'
-
.c/c.j
# ** 20. A glob follows a symlink to a regular file. (And shows the symlink's path.)
$ hledger -f glob-symlinked-file/a.j files | sed -E 's|.*/glob-symlinked-file/||'
a.j
c.j
b.j
# ** 20. A glob follows a symlink to a regular dir.
# ** 21. A glob follows a symlink to a regular dir.
$ hledger -f glob-symlinked-dir/a.j files | sed -E 's|.*/glob-symlinked-dir/||'
a.j
c/c.j
b/c.j
# ** 21. A glob follows a symlink to a dot file.
# ** 22. A glob follows a symlink to a dot file.
$ hledger -f glob-symlinked-dotfile/a.j files | sed -E 's|.*/glob-symlinked-dotfile/||'
a.j
.c.j
b.j
# ** 22. A glob does not follow a symlink to a dot dir.
# ** 23. A glob follows a symlink to a dot dir.
$ hledger -f glob-symlinked-dotdir/a.j files | sed -E 's|.*/glob-symlinked-dotdir/||'
>2 /No files were matched by glob pattern/
#>=1 # sed hides this
# ** 23. A glob does follow a symlink to a dot dir if --old-glob is used.
$ hledger -f glob-symlinked-dotdir/a.j files --old-glob | sed -E 's|.*/glob-symlinked-dotdir/||'
a.j
.c/c.j
b/c.j