imp: include: more robust tests and glob pattern handling

This switches from filepattern back to Glob, which is more powerful.
New notes, implementation, workarounds and tests.
This commit is contained in:
Simon Michael 2025-07-14 17:13:14 -07:00
parent 1046f652b1
commit b4a1add267
4 changed files with 175 additions and 87 deletions

View File

@ -73,7 +73,7 @@ where
--- ** imports
import qualified Control.Monad.Fail as Fail (fail)
import qualified Control.Exception as C
import Control.Monad (forM_, when, void, unless)
import Control.Monad (forM_, when, void, unless, filterM)
import Control.Monad.IO.Class (MonadIO, liftIO)
import Control.Monad.Except (ExceptT(..), runExceptT)
import Control.Monad.State.Strict (evalStateT,get,modify',put)
@ -94,7 +94,7 @@ import Text.Megaparsec.Char
import Text.Printf
import System.FilePath
import "Glob" System.FilePath.Glob hiding (match)
import "filepattern" System.FilePattern.Directory
-- import "filepattern" System.FilePattern.Directory
import Hledger.Data
import Hledger.Read.Common
@ -104,7 +104,8 @@ import qualified Hledger.Read.CsvReader as CsvReader (reader)
import qualified Hledger.Read.RulesReader as RulesReader (reader)
import qualified Hledger.Read.TimeclockReader as TimeclockReader (reader)
import qualified Hledger.Read.TimedotReader as TimedotReader (reader)
import System.Directory (canonicalizePath)
import System.Directory (canonicalizePath, doesFileExist)
import Data.Functor ((<&>))
--- ** doctest setup
-- $setup
@ -313,30 +314,66 @@ includedirectivep = do
-- save the position
off <- getOffset
pos <- getSourcePos
-- parse
-- parse the directive
string "include"
lift skipNonNewlineSpaces1
prefixedglob <- rstrip . T.unpack <$> takeWhileP Nothing (`notElem` [';','\n'])
lift followingcommentp
-- find file(s)
let (mprefix,glb) = splitReaderPrefix prefixedglob
f <- sourcePosFilePath pos
when (null $ dbg6 (f <> " include: glob pattern") glb) $
customFailure $ parseErrorAt off $ "include needs a file path or glob pattern argument"
paths <- getFilePaths off pos glb
-- Find the file or glob-matched files (just the ones from this include directive), with some IO error checking.
-- Also report whether it was a literal path and not a glob pattern.
(paths, isliteral) <- findMatchedFiles off pos glb
-- XXX worth the troublel ?
-- Handle duplicates. Some complexities here:
--
-- If this include directive uses a glob pattern, remove duplicates.
-- Ie if this glob pattern matches any files we have already processed (or the current file),
-- due to multiple includes in sequence or in a cycle, exclude those files so they're not processed again.
--
-- If this include directive uses a literal file path, don't remove duplicates.
-- Multiple includes in sequence will cause the included file to be processed multiple times.
-- Multiple includes forming a cycle will be detected and reported as an error in parseIncludedFile.
let paths' = if isliteral then paths else filter (const True) paths
-- if there was a reader prefix, apply it to all the file paths
let prefixedpaths = case mprefix of
Nothing -> paths
Just fmt -> map ((show fmt++":")++) paths
-- parse them inline
Nothing -> paths'
Just fmt -> map ((show fmt++":")++) paths'
-- parse each one, as if inlined here
forM_ prefixedpaths $ parseIncludedFile off pos
where
-- Find the files matched by a glob pattern, if any, using filepattern.
-- | Find the files matched by a literal path or a glob pattern.
-- Uses the current parse context for detecting the current directory and for error messages.
-- This one also ignores all dotted directories (anything under .git/, foo/.secret/, etc.)
getFilePaths :: MonadIO m => Int -> SourcePos -> FilePath -> JournalParser m [FilePath]
getFilePaths off pos globpattern = do
-- Expands a leading tilde to the user's home directory.
-- Glob patterns at the start of a path component exclude dot-named files and directories.
--
-- Checks if any matched paths are directories and excludes those.
-- Converts all matched paths to their canonical form.
findMatchedFiles :: MonadIO m => Int -> SourcePos -> FilePath -> JournalParser m ([FilePath], Bool)
findMatchedFiles off pos globpattern = do
-- Some notes about the Glob library:
-- ----------------------------------
-- It does not expand tilde.
-- It does not canonicalise paths.
-- The results are not in any particular order.
-- The results can include directories.
-- DIRPAT/ is equivalent to DIRPAT, except results will end with // (double slash).
-- . or .. at the start of a pattern can match the current or parent directories.
-- * matches any number of characters in a file or directory name.
-- * at the start of a file name ignores dot-named files and directories, by default.
-- ** (or any number of consecutive *'s) not followed by slash is equivalent to *.
-- A **/ component matches any number of directory parts.
-- A **/ ignores dot-named directories in its starting and ending directories, by default.
-- But **/ does search intermediate dot-named directories. Eg it can find a/.b/c.
-- expand a tilde at the start of the glob pattern, or throw an error
expandedglob <- lift $ expandHomePath globpattern `orRethrowIOError` "failed to expand ~"
@ -345,29 +382,43 @@ includedirectivep = do
parentfile <- sourcePosFilePath pos
let cwd = takeDirectory parentfile
-- Find all matched files, in lexicographic order (the order ls would normally show them).
-- (This might include the current file.)
-- 1. Old implementation, using Glob XXX keep providing this via --old-glob for a bit
_g <- case tryCompileWith compDefault{errorRecovery=False} expandedglob of
Right x -> pure x
-- Compile as a Glob Pattern. Can throw an error.
g <- case tryCompileWith compDefault{errorRecovery=False} expandedglob of
Left e -> customFailure $ parseErrorAt off $ "Invalid glob pattern: " ++ e
_filepaths <- liftIO $ (dbg6 (parentfile <> " include: matched files") . sort) <$> globDir1 _g cwd
-- 2. New implementation, using filepattern
filepaths <- liftIO $
map (dbg6 "cwd" cwd </>)
-- . sort -- XXX needed ?
<$>
getDirectoryFilesIgnore cwd [expandedglob] ["**/.*/**"]
Right _ | "***" `isInfixOf` expandedglob -> customFailure $ parseErrorAt off $ "Invalid glob pattern: too many stars"
Right x -> pure x
-- Throw an error if no files (not even the current file) were matched.
when (null filepaths) $
customFailure $ parseErrorAt off $ "No files were matched by file pattern: " ++ globpattern
-- Find all matched paths, in lexicographic order (the order ls would normally show them).
-- These might include directories or the current file.
paths <- liftIO $
-- (dbg6 (parentfile <> " include: matched paths") . sort) <$>
globDir1 g cwd
-- If the current file was matched, exclude it now.
let filepaths' = filter (/= parentfile) filepaths
dbg6IO (parentfile <> " include: matched files (excluding current file)") filepaths'
-- Exclude any directories or symlinks to directories, and canonicalise
files <- liftIO $
filterM doesFileExist paths
>>= mapM canonicalizePath
<&> (dbg6 (parentfile <> " include: matched files") . sort)
pure filepaths'
-- If a glob was used: exclude any intermediate dot directories that were searched.
-- As noted above, while **/ ignores dot dirs in the starting and ending dirs,
-- it does search dot dirs in between those two (something that should be fixed in Glob ?).
-- This seems likely to be inconvenient, eg when trying to avoid .git directories in subrepos.
-- So as an imperfect workaround: when using any glob, exclude all paths involving dot dirs.
-- Unfortunately this means valid globs like .dotdir/* will not succeed; only a literal
-- .dotdir/foo would work there.
let
files' = if isLiteral g then files else filter (not.hasdotdir) files
where
hasdotdir p = any isdotdir $ splitPath p
where
isdotdir c = "." `isPrefixOf` c && "/" `isSuffixOf` c
-- Throw an error if no files were matched.
when (null files') $
customFailure $ parseErrorAt off $ "No files were matched by glob pattern: " ++ globpattern
return (files', isLiteral g)
-- Parse the given included file (and any deeper includes, recursively)
-- as if it was inlined in the current (parent) file.
@ -1195,8 +1246,8 @@ tests_JournalReader = testGroup "JournalReader" [
assertParse ignoredpricecommoditydirectivep "N $\n"
,testGroup "includedirectivep" [
testCase "include" $ assertParseErrorE includedirectivep "include nosuchfile\n" "No existing files match pattern: nosuchfile"
,testCase "glob" $ assertParseErrorE includedirectivep "include nosuchfile*\n" "No existing files match pattern: nosuchfile*"
testCase "include" $ assertParseErrorE includedirectivep "include nosuchfile\n" "No files were matched by glob pattern: nosuchfile"
,testCase "glob" $ assertParseErrorE includedirectivep "include nosuchfile*\n" "No files were matched by glob pattern: nosuchfile*"
]
,testCase "marketpricedirectivep" $ assertParseEq marketpricedirectivep

View File

@ -148,7 +148,6 @@ library
, extra >=1.7.11
, file-embed >=0.0.10
, filepath
, filepattern >=0.1.3
, hashtables >=1.2.3.1
, lucid
, megaparsec >=7.0.0 && <9.8
@ -209,7 +208,6 @@ test-suite doctest
, extra >=1.7.11
, file-embed >=0.0.10
, filepath
, filepattern >=0.1.3
, hashtables >=1.2.3.1
, lucid
, megaparsec >=7.0.0 && <9.8
@ -271,7 +269,6 @@ test-suite unittest
, extra >=1.7.11
, file-embed >=0.0.10
, filepath
, filepattern >=0.1.3
, hashtables >=1.2.3.1
, hledger-lib
, lucid

View File

@ -94,7 +94,6 @@ dependencies:
- utf8-string >=0.3.5
- extra >=1.7.11
- Glob >= 0.9
- filepattern >=0.1.3
ghc-options:
- -Wall

View File

@ -29,37 +29,7 @@ $ printf '2018/01/01\n (A) 1\n' >f.journal; hledger -f - print; rm -f f.journa
>=
# ** 3. include with no argument: argument error
<
include
$ hledger -f- files
>2 /include needs a.*argument/
>=1
# ** 4. include a nonexistent file: no files matched error
<
include nosuchfile
$ hledger -f- files
>2 /No files were matched/
>=1
# # ** 5. include a directory: no files matched error
# <
# include b
# $ hledger -f- files
# >2 /No files were matched/
# >=1
# ** 6. Including the current file is harmless, it's ignored automatically.
$ hledger -f self.j files
> /self\.j$/
# # ** 7. Including a cycle is harmless, the cycle is pruned automatically.
# $ hledger -f .cycle/cycle.j files
# > /cyc/
# >=1
# ** 8. A leading tilde is expanded to $HOME.
# ** 3. A leading tilde is expanded to $HOME.
<
include ~/included.journal
$ printf '2018/01/01\n (A) 1\n' >included.journal; HOME="$PWD" hledger -f - print; rm -rf included.journal
@ -68,31 +38,102 @@ $ printf '2018/01/01\n (A) 1\n' >included.journal; HOME="$PWD" hledger -f - pr
>= 0
# # ** 9. Glob patterns. Include a malformed glob -> glob error
# <
# include ***
# $ hledger -f- files
# >2 /malformed/
# >=1
# ** 4. include with no argument -> argument error
<
include
$ hledger -f- files
>2 /include needs a.*argument/
>=1
# ** 10. Include invalid glob patterns
# ** 5. include a nonexistent file -> no files matched error
<
include nosuchfile
$ hledger -f- files
>2 /No files were matched/
>=1
# ** 6. Including a directory literally -> no files matched error
<
include b
$ hledger -f- files
>2 /No files were matched/
>=1
# ** 7. Include invalid glob patterns -> invalid glob error
<
include [.journal
$ hledger -f - print
>2 //
>2 /Invalid glob/
>= 1
# # ** 11. Include ** -> all files
# $ hledger -f glob1.j files
# ** 8. Three or more *'s -> glob error
<
include ***
$ hledger -f- files
>2 /Invalid glob/
>=1
# # ** 12. Include **.EXT -> ?
# $ hledger -f glob2.j files
# ** 9. Including the current file literally -> cycle error.
$ hledger -f self.j files
>2 /cycle/
>=1
# # ** 13. Include */**.EXT -> ?
# $ hledger -f glob3.j files
# ** 10. Including the current file via glob -> cycle error.
$ hledger -f selfglob.j files
>2 /cycle/
>=1
# # ** 14. Include **/*.EXT -> all files in or below cwd with .EXT
# $ hledger -f glob4.j files
# ** 11. Including a cycle, all literally -> cycle error
$ hledger -f .cycle/cycle.j files
>2 /cycle/
>=1
# # ** 15. Include */**/*.EXT -> all files below cwd with .EXT
# $ hledger -f glob5.j files
# ** 12. Including a cycle, involving globs -> cycle error
$ hledger -f .cycle/cycleglob.j files
>2 /cycle/
>=1
# ** 13. Include ** -> cycle error (includes current file)
$ hledger -f glob1.j files
>2 /cycle/
>=1
# ** 14. Include **.j -> cycle error (includes current file)
$ hledger -f glob2.j files
>2 /cycle/
>=1
# ** 15. Include */**.j -> all .j files in subdirectories (**.j is same as *.j)
$ hledger -f glob3.j files | sed -E 's|.*hledger/test/journal/include/||'
glob3.j
b/b.j
c/c.j
# ** 16. Include **/*.j -> cycle error (includes current file)
$ hledger -f glob4.j files
>2 /cycle/
>=1
# ** 17. Include */**/*.j -> all non-dot .j files in or below non-dot subdirectories.
<
include */**/*.j
$ hledger -f - files | sed -E 's|.*hledger/test/journal/include/||'
-
b/b.j
b/bb/bb.j
c/c.j
# ** 18. To avoid intermediate dot dirs in the above, we exclude all glob-matched paths involving dot dirs.
# So this does not find b/bb/.dotdir/dotdirbb.j, unfortunately:
<
include b/.dotdir/*.j
$ hledger -f - files | sed -E 's|.*hledger/test/journal/include/||'
>2 /No files were matched/
# sed hides the non-zero exit code
# ** 19. Only a literal path can find it.
<
include b/.dotdir/dotdirb.j
$ hledger -f - files | sed -E 's|.*hledger/test/journal/include/||'
-
b/.dotdir/dotdirb.j