From 76dc6d089ad7204b6f1defb1d2386408ba80c2e5 Mon Sep 17 00:00:00 2001
From: Simon Michael
Date: Thu, 14 Aug 2025 12:18:59 +0100
Subject: [PATCH] feat:import:archive: archive data files, and process oldest first

---
 hledger-lib/Hledger/Read/RulesReader.hs | 110 +++++++++++++++++-------
 hledger-lib/Hledger/Utils/IO.hs         |   5 +-
 hledger/Hledger/Cli/Commands/Import.md  |  55 ++++++++----
 hledger/hledger.m4.md                   |  22 ++++-
 4 files changed, 136 insertions(+), 56 deletions(-)

diff --git a/hledger-lib/Hledger/Read/RulesReader.hs b/hledger-lib/Hledger/Read/RulesReader.hs
index d4c99b5e9..b6f9141c6 100644
--- a/hledger-lib/Hledger/Read/RulesReader.hs
+++ b/hledger-lib/Hledger/Read/RulesReader.hs
@@ -22,6 +22,7 @@ Most of the code for reading rules files and csv files is in this module.
 {-# LANGUAGE ViewPatterns #-}
 {-# OPTIONS_GHC -Wno-unrecognised-pragmas #-}
 {-# LANGUAGE LambdaCase #-}
+{-# LANGUAGE TupleSections #-}
 
 --- ** exports
 module Hledger.Read.RulesReader (
@@ -67,10 +68,10 @@ import qualified Data.Text as T
 import qualified Data.Text.Encoding as T
 import qualified Data.Text.IO as T
 import Data.Time ( Day, TimeZone, UTCTime, LocalTime, ZonedTime(ZonedTime),
-  defaultTimeLocale, getCurrentTimeZone, localDay, parseTimeM, utcToLocalTime, localTimeToUTC, zonedTimeToUTC)
-import Safe (atMay, headMay, lastMay, readMay)
-import System.FilePath ((</>), takeDirectory, takeExtension, stripExtension, takeFileName)
-import System.IO (Handle, hClose)
+  defaultTimeLocale, getCurrentTimeZone, localDay, parseTimeM, utcToLocalTime, localTimeToUTC, zonedTimeToUTC, utctDay)
+import Safe (atMay, headMay, lastMay, readMay, headDef)
+import System.FilePath ((</>), takeDirectory, takeExtension, stripExtension, takeFileName, takeBaseName, (<.>))
+import System.IO (Handle, hClose, hPutStrLn, stderr)
 import qualified Data.Csv as Cassava
 import qualified Data.Csv.Parser.Megaparsec as CassavaMegaparsec
 import qualified Data.ByteString as B
@@ -84,7 +85,7 @@ import Hledger.Data
 import Hledger.Utils
 import Hledger.Read.Common (aliasesFromOpts, Reader(..), InputOpts(..), amountp, statusp, journalFinalise, accountnamep, transactioncommentp, postingcommentp )
 import Hledger.Write.Csv
-import System.Directory (doesFileExist, getHomeDirectory)
+import System.Directory (doesFileExist, getHomeDirectory, renameFile, getModificationTime, createDirectoryIfMissing)
 import Data.Either (fromRight)
 import Control.DeepSeq (deepseq)
 
@@ -110,41 +111,64 @@ getDownloadDir = do
   home <- getHomeDirectory
   return $ home </> "Downloads"  -- XXX
 
--- | Parse and post-process a "Journal" from the given rules file path, or give an error.
--- A data file is inferred from the @source@ rule, otherwise from a similarly-named file
--- in the same directory.
--- The source rule can specify a glob pattern and supports ~ for home directory.
--- If it is a bare filename it will be relative to the defaut download directory
--- on this system. If is a relative file path it will be relative to the rules
--- file's directory. When a glob pattern matches multiple files, the alphabetically
--- last is used. (Eg in case of multiple numbered downloads, the highest-numbered
--- will be used.)
+-- | Read, parse and post-process a "Journal" from the given rules file, or give an error.
+--
 -- The provided handle, or a --rules option, are ignored by this reader.
--- Balance assertions are not checked.
+-- A data file is inferred from the @source@ rule, otherwise from a similarly-named file in the same directory.
+-- The source rule supports ~ for home directory.
+-- If it is a bare filename, its directory is assumed to be ~/Downloads.
+-- If it is a relative file path, it is assumed to be relative to the rules file's directory.
+-- The source rule can specify a glob pattern.
+-- If the glob pattern matches multiple files, the newest (last modified) file is used,
+-- unless the import command is running and archiving is enabled, in which case the oldest file is used.
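+-- For example, an illustrative sketch: given a rules file ~/finance/bank.csv.rules,
+-- "source bank*.csv" reads a matching file from ~/Downloads,
+-- "source ./bank*.csv" reads one from ~/finance,
+-- and with no source rule, the data file ~/finance/bank.csv is assumed.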
+-- When the import command is running and archiving is enabled, after a successful read
+-- the data file is archived in an archive directory (data/ next to the rules file, auto-created).
+-- Balance assertions are not checked by this reader.
+--
 parse :: InputOpts -> FilePath -> Handle -> ExceptT String IO Journal
-parse iopts f h = do
-  lift $ hClose h  -- We don't need it
-  rules <- readRulesFile $ dbg4 "reading rules file" f
+parse iopts rulesfile h = do
+  lift $ hClose h  -- We don't need it (XXX why?)
+  -- XXX higher-than-usual debug level for file reading, to bypass excessive noise from elsewhere; normally 6 or 7
+  rules <- readRulesFile $ dbg4 "reading rules file" rulesfile
+
+  let
+    -- XXX How can we know when the command is import, and if it's a dry run? In a hacky way, currently.
+    args      = progArgs
+    cmd       = headDef "" $ dropWhile ((=="-").take 1) args
+    importcmd = cmd `elem` ["import", "imp"]
+    dryrun    = any (`elem` args) ["--dry-run", "--dry"]
+    importing = importcmd && not dryrun
+    archiving = importing && isJust (getDirective "archive" rules)
+    rulesdir   = takeDirectory rulesfile
+    archivedir = rulesdir </> "data"
+
   mdatafile <- liftIO $ do
-    dldir <- getDownloadDir
-    let rulesdir = takeDirectory f
+    dldir <- getDownloadDir  -- look here for the data file if it's specified without a directory
     let msource = T.unpack <$> getDirective "source" rules
-    fs <- case msource of
-      Just src -> expandGlob dir (dbg4 "source" src) >>= sortByModTime <&> dbg4 ("matched files"<>desc<>", newest first")
-        where (dir,desc) = if isFileName src then (dldir," in download directory") else (rulesdir,"")
-      Nothing -> return [maybe err (dbg4 "inferred source") $ dataFileFor f]  -- shouldn't fail, f has .rules extension
-        where err = error' $ "could not infer a data file for " <> f
-    return $ dbg4 "data file" $ headMay fs
+    -- WISH: when not importing, and the source rule matches no files, read the latest archived file
+    datafiles <- case msource of
+      Just glb -> expandGlob dir (dbg4 "source" glb) >>= sortByModTime <&> dbg4 ("matched files"<>desc<>", oldest first")
+        where (dir,desc) = if isFileName glb then (dldir," in download directory") else (rulesdir,"")
+      Nothing -> return [maybe err (dbg4 "inferred source") $ dataFileFor rulesfile]  -- shouldn't fail, rulesfile has .rules extension
+        where err = error' $ "could not infer a data file for " <> rulesfile
+    return $ case datafiles of
+      []                           -> Nothing
+      [f] | importcmd              -> dbg4 "importing" <$> Just f
+      [f]                          -> dbg4 "reading" <$> Just f
+      fs | importcmd && archiving  -> dbg4 "importing oldest file" <$> headMay fs
+      fs | importcmd               -> dbg4 "importing newest file" <$> lastMay fs
+      fs                           -> dbg4 "reading newest file" <$> lastMay fs
+
   case mdatafile of
     Nothing -> return nulljournal  -- data file specified by source rule was not found
-    Just dat -> do
-      exists <- liftIO $ doesFileExist dat
-      if not (dat=="-" || exists)
+    Just datafile -> do
+      exists <- liftIO $ doesFileExist datafile
+      if not (datafile=="-" || exists)
         then return nulljournal  -- data file inferred from rules file name was not found
        else do
-          dath <- liftIO $ openFileOrStdin dat
-          readJournalFromCsv (Just $ Left rules) dat dath Nothing
+          datafileh <- liftIO $ openFileOrStdin datafile
+          readJournalFromCsv (Just $ Left rules) datafile datafileh Nothing
             -- apply any command line account aliases. Can fail with a bad replacement pattern.
            >>= liftEither . journalApplyAliases (aliasesFromOpts iopts)
             -- journalFinalise assumes the journal's items are
@@ -152,7 +176,24 @@
             -- in reverse order of parsing.
             -- But here they are already properly ordered. So we'd
             -- better preemptively reverse them once more. XXX inefficient
             . journalReverse
-            >>= journalFinalise iopts{balancingopts_=(balancingopts_ iopts){ignore_assertions_=True}} f ""
+            >>= journalFinalise iopts{balancingopts_=(balancingopts_ iopts){ignore_assertions_=True}} rulesfile ""
+            >>= \j -> do
+              when archiving $ liftIO $ archiveTo datafile archivedir
+              return j
+
+-- | Move a file to the given directory, creating the directory (and parents) if needed,
+-- showing informational output on stderr.
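+--
+-- For example, an illustrative sketch, assuming a 2025-08-14 modification time:
+-- archiveTo "/home/u/Downloads/bank (2).csv" "/home/u/finance/data"
+-- moves the file to "/home/u/finance/data/bank (2).2025-08-14.csv".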
+archiveTo :: FilePath -> FilePath -> IO ()
+archiveTo datafile archivedir = do
+  createDirectoryIfMissing True archivedir
+  hPutStrLn stderr $ "archiving " <> datafile
+  datafilemodtime <- getModificationTime datafile
+  let
+    archivefilename = takeBaseName datafile <.> datafilemoddate <.> takeExtension datafile
+      where datafilemoddate = show $ utctDay datafilemodtime
+    archivefile = archivedir </> archivefilename
+  hPutStrLn stderr $ " as " <> archivefile
+  renameFile datafile archivefile
 
 --- ** reading rules files
 --- *** rules utilities
@@ -392,10 +433,12 @@ Grammar for the CSV conversion rules, more or less:
 
 RULES: RULE*
 
-RULE: ( SOURCE | FIELD-LIST | FIELD-ASSIGNMENT | CONDITIONAL-BLOCK | SKIP | TIMEZONE | NEWEST-FIRST | INTRA-DAY-REVERSED | DATE-FORMAT | DECIMAL-MARK | COMMENT | BLANK ) NEWLINE
+RULE: ( SOURCE | ARCHIVE | FIELD-LIST | FIELD-ASSIGNMENT | CONDITIONAL-BLOCK | SKIP | TIMEZONE | NEWEST-FIRST | INTRA-DAY-REVERSED | DATE-FORMAT | DECIMAL-MARK | COMMENT | BLANK ) NEWLINE
 
 SOURCE: source SPACE FILEPATH
 
+ARCHIVE: archive
+
 FIELD-LIST: fields SPACE FIELD-NAME ( SPACE? , SPACE? FIELD-NAME )*
 FIELD-NAME: QUOTED-FIELD-NAME | BARE-FIELD-NAME
 
@@ -518,6 +561,7 @@ directivep = (do
 directives :: [Text]
 directives =
   ["source"
+  ,"archive"
   ,"encoding"
   ,"date-format"
   ,"decimal-mark"
diff --git a/hledger-lib/Hledger/Utils/IO.hs b/hledger-lib/Hledger/Utils/IO.hs
index a8a5438d6..8456381bd 100644
--- a/hledger-lib/Hledger/Utils/IO.hs
+++ b/hledger-lib/Hledger/Utils/IO.hs
@@ -131,7 +131,6 @@ import Data.FileEmbed (makeRelativeToProject, embedStringFile)
 import Data.Functor ((<&>))
 import Data.List hiding (uncons)
 import Data.Maybe (isJust, catMaybes)
-import Data.Ord (comparing, Down (Down))
 import qualified Data.Text as T
 import Data.Text.Encoding.Error (UnicodeException)
 import qualified Data.Text.IO as T
@@ -415,11 +414,11 @@ expandPath curdir p = (if isRelative p then (curdir </>) else id) <$> expandHome
 expandGlob :: FilePath -> FilePath -> IO [FilePath]
 expandGlob curdir p = expandPath curdir p >>= glob <&> sort  -- PARTIAL:
 
--- | Given a list of existing file paths, sort them by modification time, most recent first.
+-- | Given a list of existing file paths, sort them by modification time (from oldest to newest).
 sortByModTime :: [FilePath] -> IO [FilePath]
 sortByModTime fs = do
   ftimes <- forM fs $ \f -> do {t <- getModificationTime f; return (t,f)}
-  return $ map snd $ sortBy (comparing Data.Ord.Down) ftimes
+  return $ map snd $ sort ftimes
 
 -- | Like readFilePortably, but read all of the file before proceeding.
 readFileStrictly :: FilePath -> IO T.Text
diff --git a/hledger/Hledger/Cli/Commands/Import.md b/hledger/Hledger/Cli/Commands/Import.md
index ec46814ac..c339dd4e8 100644
--- a/hledger/Hledger/Cli/Commands/Import.md
+++ b/hledger/Hledger/Cli/Commands/Import.md
@@ -26,7 +26,7 @@ $ hledger import bank1-checking.csv bank1-savings.csv
 $ hledger import *.csv
 ```
 
-### Import preview
+### Import dry run
 
 It's useful to preview the import by running first with `--dry-run`,
 to sanity check the range of dates being imported,
@@ -160,28 +160,47 @@ as declared by [`commodity` directives](#commodity-directive) or inferred from t
 
 Related: [CSV > Amount decimal places](#amount-decimal-places).
 
+### Import archiving
+
+When importing from a CSV rules file (`hledger import bank.rules`),
+you can use the [archive rule](#archive) to enable automatic archiving of the data file.
+After a successful import, the data file (specified by `source`) will be moved
+to an archive folder (`data/`, next to the rules file, auto-created)
+and renamed with its modification date added before the file extension.
+This can be useful for troubleshooting, detecting variations in your banks' CSV data,
+regenerating entries with improved rules, etc.
+
+The `archive` rule also causes `import` to handle `source` glob patterns differently:
+when there are multiple matched files, it will pick the oldest, not the newest.
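+
+For example, here is a minimal rules file sketch enabling archiving
+(the file names are illustrative, and other conversion rules are omitted):
+
+```rules
+# bank.rules
+source bank*.csv
+archive
+```
+
+With this, `hledger import bank.rules` imports the oldest file matching `bank*.csv`,
+then moves it to eg `data/bank.2025-08-14.csv` next to `bank.rules`.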
+
 ### Import special cases
 
-If you have a download whose file name varies, you could rename it to a fixed name after each download.
-Or you could use a [CSV `source` rule](#source) with a suitable glob pattern,
-and import [from the .rules file](#reading-files-specified-by-rule) instead of the data file.
-
-Here's a situation where you would need to run `import` with care:
-say you download `bank.csv`, but forget to import it or delete it.
-And next month you download it again. This time your web browser may save it as `bank (2).csv`.
-So now each of these may have data not included in the other.
-And a `source` rule with a glob pattern would match only the most recent file.
-So in this case you should import from each one in turn, in the correct order, taking care to use the same filename each time:
-
-```cli
-$ hledger import bank.csv
-$ mv 'bank (2).csv' bank.csv
-$ hledger import bank.csv
-```
+#### Deduplication
 
 Here are two kinds of "deduplication" which `import` does not handle
-(and generally should not, since these can happen legitimately in financial data):
+(and should not, because these can happen legitimately in financial data):
 
 - Two or more of the new CSV records are identical, and generate identical new journal entries.
 
 - A new CSV record generates a journal entry identical to one(s) already in the journal.
 
+#### Varying file name
+
+If you have a download whose file name varies, you could rename it to a fixed name after each download.
+Or you could use a [CSV `source` rule](#source) with a suitable glob pattern,
+and import [from the .rules file](#reading-files-specified-by-rule).
+
+#### Multiple versions
+
+Say you download `bank.csv`, import it, but forget to delete it from your downloads folder.
+The next time you download it, your web browser will save it as (eg) `bank (2).csv`.
+The [source rule](#source)'s glob patterns are for just this situation:
+instead of specifying `source bank.csv`, specify `source bank*.csv`.
+Then `hledger -f bank.rules CMD` or `hledger import bank.rules`
+will automatically pick the newest matched file (`bank (2).csv`).
+
+Alternatively: what if you download, but forget to import or delete, then download again?
+Now each of `bank.csv` and `bank (2).csv` might contain data that's not in the other, and not in your journal.
+In this case, it's best to import each of them in turn, oldest first
+(otherwise, overlap detection could cause new records to be skipped).
+Enabling [import archiving](#import-archiving) ensures this.
+Then `hledger import bank.rules; hledger import bank.rules` will import and archive first `bank.csv`, then `bank (2).csv`.
diff --git a/hledger/hledger.m4.md b/hledger/hledger.m4.md
index 9df41b72e..b0da703bd 100644
--- a/hledger/hledger.m4.md
+++ b/hledger/hledger.m4.md
@@ -3280,15 +3280,33 @@ in your system's downloads directory (`~/Downloads`, currently):
 source Checking1.csv
 ```
 
-And if you specify a glob pattern, hledger will read the most recent of the matched files
-(useful with repeated downloads):
+And if you specify a glob pattern, hledger will read the newest (most recently modified) of the matched files,
+which is useful eg if your browser has saved multiple versions of a download:
 
 ```rules
 source Checking1*.csv
 ```
 
+This enables a convenient workflow where you just download CSV files to the default place, then run `hledger import rules/*`.
+Once they have been imported, you can discard or ignore them.
+
 See also ["Working with CSV > Reading files specified by rule"](#reading-files-specified-by-rule).
 
+## `archive`
+
+The `archive` rule can be used together with `source` to make importing a little more convenient.
+It affects only the [import](#import) command. When enabled,
+
+- `import` will process multiple `source` glob matches oldest first.
+  So if you have multiple versions of a download, repeated imports will process them in chronological order.
+
+- After successfully importing a `source`-specified file,
+  `import` will move it to an archive directory (`data/` next to the rules file, auto-created),
+  and rename it to `DATAFILENAME.MODIFICATIONDATE.EXT`, ie the data file's name with its modification date added before the extension.
+
+Archiving imported files in this way is completely optional, but it can be useful for troubleshooting,
+detecting variations in your banks' CSV data, regenerating entries with improved rules, etc.
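+
+For example, with this rules file (an illustrative sketch; your file names and dates will vary):
+
+```rules
+source Checking1*.csv
+archive
+```
+
+importing a `~/Downloads/Checking1 (2).csv` file last modified on 2025-08-14
+would move it to `data/Checking1 (2).2025-08-14.csv` next to the rules file.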
+
 ## `encoding`
 
 ```rules