feat:import:archive: archive data files, and process oldest first

Simon Michael 2025-08-14 12:18:59 +01:00
parent db7783b16d
commit 76dc6d089a
4 changed files with 136 additions and 56 deletions


@@ -22,6 +22,7 @@ Most of the code for reading rules files and csv files is in this module.
 {-# LANGUAGE ViewPatterns #-}
 {-# OPTIONS_GHC -Wno-unrecognised-pragmas #-}
 {-# LANGUAGE LambdaCase #-}
+{-# LANGUAGE TupleSections #-}
 
 --- ** exports
 module Hledger.Read.RulesReader (
@@ -67,10 +68,10 @@ import qualified Data.Text as T
 import qualified Data.Text.Encoding as T
 import qualified Data.Text.IO as T
 import Data.Time ( Day, TimeZone, UTCTime, LocalTime, ZonedTime(ZonedTime),
-  defaultTimeLocale, getCurrentTimeZone, localDay, parseTimeM, utcToLocalTime, localTimeToUTC, zonedTimeToUTC)
-import Safe (atMay, headMay, lastMay, readMay)
-import System.FilePath ((</>), takeDirectory, takeExtension, stripExtension, takeFileName)
-import System.IO (Handle, hClose)
+  defaultTimeLocale, getCurrentTimeZone, localDay, parseTimeM, utcToLocalTime, localTimeToUTC, zonedTimeToUTC, utctDay)
+import Safe (atMay, headMay, lastMay, readMay, headDef)
+import System.FilePath ((</>), takeDirectory, takeExtension, stripExtension, takeFileName, takeBaseName, (<.>))
+import System.IO (Handle, hClose, hPutStrLn, stderr)
 import qualified Data.Csv as Cassava
 import qualified Data.Csv.Parser.Megaparsec as CassavaMegaparsec
 import qualified Data.ByteString as B
@@ -84,7 +85,7 @@ import Hledger.Data
 import Hledger.Utils
 import Hledger.Read.Common (aliasesFromOpts, Reader(..), InputOpts(..), amountp, statusp, journalFinalise, accountnamep, transactioncommentp, postingcommentp )
 import Hledger.Write.Csv
-import System.Directory (doesFileExist, getHomeDirectory)
+import System.Directory (doesFileExist, getHomeDirectory, renameFile, getModificationTime, createDirectoryIfMissing)
 import Data.Either (fromRight)
 import Control.DeepSeq (deepseq)
@@ -110,41 +111,64 @@ getDownloadDir = do
   home <- getHomeDirectory
   return $ home </> "Downloads" -- XXX
 
--- | Parse and post-process a "Journal" from the given rules file path, or give an error.
--- A data file is inferred from the @source@ rule, otherwise from a similarly-named file
--- in the same directory.
--- The source rule can specify a glob pattern and supports ~ for home directory.
--- If it is a bare filename it will be relative to the defaut download directory
--- on this system. If is a relative file path it will be relative to the rules
--- file's directory. When a glob pattern matches multiple files, the alphabetically
--- last is used. (Eg in case of multiple numbered downloads, the highest-numbered
--- will be used.)
+-- | Read, parse and post-process a "Journal" from the given rules file, or give an error.
+--
+-- The provided handle, and any --rules option, are ignored by this reader.
+-- A data file is inferred from the @source@ rule, otherwise from a similarly-named file in the same directory.
+-- The source rule supports ~ for home directory.
+-- If it is a bare filename, its directory is assumed to be ~/Downloads.
+-- If it is a relative file path, it is assumed to be relative to the rules file's directory.
+-- The source rule can specify a glob pattern.
+-- If the glob pattern matches multiple files, the newest (last modified) file is used,
+-- unless the import command is running and archiving is enabled, in which case the oldest file is used.
+-- When the import command is running and archiving is enabled, after a successful read
+-- the data file is archived in an archive directory (data/ next to the rules file, auto-created).
+-- Balance assertions are not checked by this reader.
+--
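+-- Eg (a hypothetical illustration): with "source Checking1*.csv", and files Checking1.csv
+-- and "Checking1 (2).csv" both present, an ordinary read uses the newer file, while a
+-- non-dry-run "hledger import" with the archive rule enabled imports the older one first.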
 parse :: InputOpts -> FilePath -> Handle -> ExceptT String IO Journal
-parse iopts f h = do
-  lift $ hClose h -- We don't need it
-  rules <- readRulesFile $ dbg4 "reading rules file" f
+parse iopts rulesfile h = do
+  lift $ hClose h -- We don't need it (XXX why ?)
+  -- XXX higher-than-usual debug level for file reading, to bypass excessive noise from elsewhere; normally 6 or 7
+  rules <- readRulesFile $ dbg4 "reading rules file" rulesfile
+  let
+    -- XXX How can we know when the command is import, and if it's a dry run ? In a hacky way, currently.
+    args = progArgs
+    cmd = headDef "" $ dropWhile ((=="-").take 1) args
+    importcmd = cmd `elem` ["import", "imp"]
+    dryrun = any (`elem` args) ["--dry-run", "--dry"]
+    importing = importcmd && not dryrun
+    archiving = importing && isJust (getDirective "archive" rules)
+    rulesdir = takeDirectory rulesfile
+    archivedir = rulesdir </> "data"
   mdatafile <- liftIO $ do
-    dldir <- getDownloadDir
-    let rulesdir = takeDirectory f
+    dldir <- getDownloadDir -- look here for the data file if it's specified without a directory
     let msource = T.unpack <$> getDirective "source" rules
-    fs <- case msource of
-      Just src -> expandGlob dir (dbg4 "source" src) >>= sortByModTime <&> dbg4 ("matched files"<>desc<>", newest first")
-        where (dir,desc) = if isFileName src then (dldir," in download directory") else (rulesdir,"")
-      Nothing -> return [maybe err (dbg4 "inferred source") $ dataFileFor f] -- shouldn't fail, f has .rules extension
-        where err = error' $ "could not infer a data file for " <> f
-    return $ dbg4 "data file" $ headMay fs
+    -- WISH: when not importing, and the source rule matches no files, read the latest archived file
+    datafiles <- case msource of
+      Just glb -> expandGlob dir (dbg4 "source" glb) >>= sortByModTime <&> dbg4 ("matched files"<>desc<>", oldest first")
+        where (dir,desc) = if isFileName glb then (dldir," in download directory") else (rulesdir,"")
+      Nothing -> return [maybe err (dbg4 "inferred source") $ dataFileFor rulesfile] -- shouldn't fail, rulesfile has .rules extension
+        where err = error' $ "could not infer a data file for " <> rulesfile
+    return $ case datafiles of
+      [] -> Nothing
+      [f] | importcmd -> dbg4 "importing" <$> Just f
+      [f] -> dbg4 "reading" <$> Just f
+      fs | importcmd && archiving -> dbg4 "importing oldest file" <$> headMay fs
+      fs | importcmd -> dbg4 "importing newest file" <$> lastMay fs
+      fs -> dbg4 "reading newest file" <$> lastMay fs
   case mdatafile of
     Nothing -> return nulljournal -- data file specified by source rule was not found
-    Just dat -> do
-      exists <- liftIO $ doesFileExist dat
-      if not (dat=="-" || exists)
+    Just datafile -> do
+      exists <- liftIO $ doesFileExist datafile
+      if not (datafile=="-" || exists)
         then return nulljournal -- data file inferred from rules file name was not found
         else do
-          dath <- liftIO $ openFileOrStdin dat
-          readJournalFromCsv (Just $ Left rules) dat dath Nothing
+          datafileh <- liftIO $ openFileOrStdin datafile
+          readJournalFromCsv (Just $ Left rules) datafile datafileh Nothing
           -- apply any command line account aliases. Can fail with a bad replacement pattern.
             >>= liftEither . journalApplyAliases (aliasesFromOpts iopts)
           -- journalFinalise assumes the journal's items are
@@ -152,7 +176,24 @@ parse iopts f h = do
           -- But here they are already properly ordered. So we'd
           -- better preemptively reverse them once more. XXX inefficient
             . journalReverse
-            >>= journalFinalise iopts{balancingopts_=(balancingopts_ iopts){ignore_assertions_=True}} f ""
+            >>= journalFinalise iopts{balancingopts_=(balancingopts_ iopts){ignore_assertions_=True}} rulesfile ""
+            >>= \j -> do
+              when archiving $ liftIO $ archiveTo datafile archivedir
+              return j
+
+-- | Move a file to the given directory, creating the directory (and parents) if needed,
+-- showing informational output on stderr.
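+-- Eg (hypothetical names): archiving "Checking1 (2).csv", last modified 2025-08-14,
+-- to directory "data" moves it to "data/Checking1 (2).2025-08-14.csv".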
+archiveTo :: FilePath -> FilePath -> IO ()
+archiveTo datafile archivedir = do
+  createDirectoryIfMissing True archivedir
+  hPutStrLn stderr $ "archiving " <> datafile
+  datafilemodtime <- getModificationTime datafile
+  let
+    archivefilename = takeBaseName datafile <.> datafilemoddate <.> takeExtension datafile
+      where datafilemoddate = show $ utctDay datafilemodtime
+    archivefile = archivedir </> archivefilename
+  hPutStrLn stderr $ " as " <> archivefile
+  renameFile datafile archivefile
 
 --- ** reading rules files
 --- *** rules utilities
@@ -392,10 +433,12 @@ Grammar for the CSV conversion rules, more or less:
 RULES: RULE*
 
-RULE: ( SOURCE | FIELD-LIST | FIELD-ASSIGNMENT | CONDITIONAL-BLOCK | SKIP | TIMEZONE | NEWEST-FIRST | INTRA-DAY-REVERSED | DATE-FORMAT | DECIMAL-MARK | COMMENT | BLANK ) NEWLINE
+RULE: ( SOURCE | ARCHIVE | FIELD-LIST | FIELD-ASSIGNMENT | CONDITIONAL-BLOCK | SKIP | TIMEZONE | NEWEST-FIRST | INTRA-DAY-REVERSED | DATE-FORMAT | DECIMAL-MARK | COMMENT | BLANK ) NEWLINE
 
 SOURCE: source SPACE FILEPATH
 
+ARCHIVE: archive
+
 FIELD-LIST: fields SPACE FIELD-NAME ( SPACE? , SPACE? FIELD-NAME )*
 
 FIELD-NAME: QUOTED-FIELD-NAME | BARE-FIELD-NAME
@@ -518,6 +561,7 @@ directivep = (do
 directives :: [Text]
 directives =
   ["source"
+  ,"archive"
   ,"encoding"
   ,"date-format"
   ,"decimal-mark"


@@ -131,7 +131,6 @@ import Data.FileEmbed (makeRelativeToProject, embedStringFile)
 import Data.Functor ((<&>))
 import Data.List hiding (uncons)
 import Data.Maybe (isJust, catMaybes)
-import Data.Ord (comparing, Down (Down))
 import qualified Data.Text as T
 import Data.Text.Encoding.Error (UnicodeException)
 import qualified Data.Text.IO as T
@@ -415,11 +414,11 @@ expandPath curdir p = (if isRelative p then (curdir </>) else id) <$> expandHome
 expandGlob :: FilePath -> FilePath -> IO [FilePath]
 expandGlob curdir p = expandPath curdir p >>= glob <&> sort -- PARTIAL:
 
--- | Given a list of existing file paths, sort them by modification time, most recent first.
+-- | Given a list of existing file paths, sort them by modification time (from oldest to newest).
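+-- Ties (equal modification times) are broken by file path, since the sort is on (time, path) pairs.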
 sortByModTime :: [FilePath] -> IO [FilePath]
 sortByModTime fs = do
   ftimes <- forM fs $ \f -> do {t <- getModificationTime f; return (t,f)}
-  return $ map snd $ sortBy (comparing Data.Ord.Down) ftimes
+  return $ map snd $ sort ftimes
 
 -- | Like readFilePortably, but read all of the file before proceeding.
 readFileStrictly :: FilePath -> IO T.Text


@@ -26,7 +26,7 @@ $ hledger import bank1-checking.csv bank1-savings.csv
 $ hledger import *.csv
 ```
 
-### Import preview
+### Import dry run
 
 It's useful to preview the import by running first with `--dry-run`,
 to sanity check the range of dates being imported,
@@ -160,28 +160,47 @@ as declared by [`commodity` directives](#commodity-directive) or inferred from t
 Related: [CSV > Amount decimal places](#amount-decimal-places).
 
+### Import archiving
+
+When importing from a CSV rules file (`hledger import bank.rules`),
+you can use the [archive rule](#archive) to enable automatic archiving of the data file.
+After a successful import, the data file (specified by `source`) will be moved
+to an archive folder (`data/`, next to the rules file, auto-created),
+and renamed to include its modification date (eg `bank.2025-08-14.csv`).
+This can be useful for troubleshooting, detecting variations in your banks' CSV data,
+regenerating entries with improved rules, etc.
+
+The `archive` rule also causes `import` to handle `source` glob patterns differently:
+when there are multiple matched files, it will pick the oldest, not the newest.
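+
+For example, a minimal rules file with archiving enabled might look like this
+(file names here are hypothetical; your usual conversion rules would follow):
+
+```rules
+# bank.rules
+source bank*.csv
+archive
+```
+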
-### Import special cases
-
-If you have a download whose file name varies, you could rename it to a fixed name after each download.
-Or you could use a [CSV `source` rule](#source) with a suitable glob pattern,
-and import [from the .rules file](#reading-files-specified-by-rule) instead of the data file.
-
-Here's a situation where you would need to run `import` with care:
-say you download `bank.csv`, but forget to import it or delete it.
-And next month you download it again. This time your web browser may save it as `bank (2).csv`.
-So now each of these may have data not included in the other.
-And a `source` rule with a glob pattern would match only the most recent file.
-So in this case you should import from each one in turn, in the correct order, taking care to use the same filename each time:
-
-```cli
-$ hledger import bank.csv
-$ mv 'bank (2).csv' bank.csv
-$ hledger import bank.csv
-```
-
 #### Deduplication
 
 Here are two kinds of "deduplication" which `import` does not handle
-(and generally should not, since these can happen legitimately in financial data):
+(and should not, because these can happen legitimately in financial data):
 
 - Two or more of the new CSV records are identical, and generate identical new journal entries.
 - A new CSV record generates a journal entry identical to one(s) already in the journal.
+#### Varying file name
+
+If you have a download whose file name varies, you could rename it to a fixed name after each download.
+Or you could use a [CSV `source` rule](#source) with a suitable glob pattern,
+and import [from the .rules file](#reading-files-specified-by-rule).
+
+#### Multiple versions
+
+Say you download `bank.csv`, import it, but forget to delete it from your downloads folder.
+The next time you download it, your web browser will save it as (eg) `bank (2).csv`.
+The [source rule](#source)'s glob patterns are for just this situation:
+instead of specifying `source bank.csv`, specify `source bank*.csv`.
+Then `hledger -f bank.rules CMD` or `hledger import bank.rules`
+will automatically pick the newest matched file (`bank (2).csv`).
+
+Alternatively, what if you download, but forget to import or delete, then download again ?
+Now each of `bank.csv` and `bank (2).csv` might contain data that's not in the other, and not in your journal.
+In this case, it's best to import each of them in turn, oldest first
+(otherwise, overlap detection could cause new records to be skipped).
+Enabling [import archiving](#import-archiving) ensures this.
+Then `hledger import bank.rules; hledger import bank.rules` will import and archive first `bank.csv`, then `bank (2).csv`.
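+
+For example, with archiving enabled (hypothetical file names; other output omitted):
+
+```cli
+$ hledger import bank.rules    # imports and archives the oldest match, bank.csv
+$ hledger import bank.rules    # imports and archives the remaining one, bank (2).csv
+```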


@@ -3280,15 +3280,33 @@ in your system's downloads directory (`~/Downloads`, currently):
 source Checking1.csv
 ```
 
-And if you specify a glob pattern, hledger will read the most recent of the matched files
-(useful with repeated downloads):
+And if you specify a glob pattern, hledger will read the newest (most recently modified) of the matched files,
+which is useful eg if your browser has saved multiple versions of a download:
 
 ```rules
 source Checking1*.csv
 ```
 
 This enables a convenient workflow where you just download CSV files to the default place, then run `hledger import rules/*`.
 Once they have been imported, you can discard them or ignore them.
 
 See also ["Working with CSV > Reading files specified by rule"](#reading-files-specified-by-rule).
+## `archive`
+
+The `archive` rule can be used together with `source` to make importing a little more convenient.
+It affects only the [import](#import) command. When enabled,
+
+- `import` will process multiple `source` glob matches oldest first.
+  So if you have multiple versions of a download, repeated imports will process them in chronological order.
+
+- After successfully importing a `source`-specified file,
+  `import` will move it to an archive directory (`data/`, next to the rules file, auto-created),
+  and rename it to `DATAFILENAME.MODIFICATIONDATE.EXTENSION` (eg `Checking1.2025-08-14.csv`).
+
+Archiving imported files in this way is completely optional, but it can be useful for troubleshooting,
+detecting variations in your banks' CSV data, regenerating entries with improved rules, etc.
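+
+For example (hypothetical file names, and other import output omitted),
+a successful `hledger import bank.rules` might print to stderr:
+
+```cli
+archiving bank.csv
+ as ./data/bank.2025-08-14.csv
+```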
 
 ## `encoding`
 
 ```rules