feat:import:archive: archive data files, and process oldest first
parent db7783b16d
commit 76dc6d089a
@@ -22,6 +22,7 @@ Most of the code for reading rules files and csv files is in this module.
 {-# LANGUAGE ViewPatterns #-}
 {-# OPTIONS_GHC -Wno-unrecognised-pragmas #-}
 {-# LANGUAGE LambdaCase #-}
+{-# LANGUAGE TupleSections #-}

 --- ** exports
 module Hledger.Read.RulesReader (
@@ -67,10 +68,10 @@ import qualified Data.Text as T
 import qualified Data.Text.Encoding as T
 import qualified Data.Text.IO as T
 import Data.Time ( Day, TimeZone, UTCTime, LocalTime, ZonedTime(ZonedTime),
-  defaultTimeLocale, getCurrentTimeZone, localDay, parseTimeM, utcToLocalTime, localTimeToUTC, zonedTimeToUTC)
-import Safe (atMay, headMay, lastMay, readMay)
-import System.FilePath ((</>), takeDirectory, takeExtension, stripExtension, takeFileName)
-import System.IO (Handle, hClose)
+  defaultTimeLocale, getCurrentTimeZone, localDay, parseTimeM, utcToLocalTime, localTimeToUTC, zonedTimeToUTC, utctDay)
+import Safe (atMay, headMay, lastMay, readMay, headDef)
+import System.FilePath ((</>), takeDirectory, takeExtension, stripExtension, takeFileName, takeBaseName, (<.>))
+import System.IO (Handle, hClose, hPutStrLn, stderr)
 import qualified Data.Csv as Cassava
 import qualified Data.Csv.Parser.Megaparsec as CassavaMegaparsec
 import qualified Data.ByteString as B
@@ -84,7 +85,7 @@ import Hledger.Data
 import Hledger.Utils
 import Hledger.Read.Common (aliasesFromOpts, Reader(..), InputOpts(..), amountp, statusp, journalFinalise, accountnamep, transactioncommentp, postingcommentp )
 import Hledger.Write.Csv
-import System.Directory (doesFileExist, getHomeDirectory)
+import System.Directory (doesFileExist, getHomeDirectory, renameFile, getModificationTime, createDirectoryIfMissing)
 import Data.Either (fromRight)
 import Control.DeepSeq (deepseq)

@@ -110,41 +111,64 @@ getDownloadDir = do
   home <- getHomeDirectory
   return $ home </> "Downloads" -- XXX

--- | Parse and post-process a "Journal" from the given rules file path, or give an error.
--- A data file is inferred from the @source@ rule, otherwise from a similarly-named file
--- in the same directory.
--- The source rule can specify a glob pattern and supports ~ for home directory.
--- If it is a bare filename it will be relative to the defaut download directory
--- on this system. If is a relative file path it will be relative to the rules
--- file's directory. When a glob pattern matches multiple files, the alphabetically
--- last is used. (Eg in case of multiple numbered downloads, the highest-numbered
--- will be used.)
+-- | Read, parse and post-process a "Journal" from the given rules file, or give an error.
+--
 -- The provided handle, or a --rules option, are ignored by this reader.
--- Balance assertions are not checked.
+-- A data file is inferred from the @source@ rule, otherwise from a similarly-named file in the same directory.
+-- The source rule supports ~ for home directory.
+-- If it is a bare filename, its directory is assumed to be ~/Downloads.
+-- If it is a relative file path, it is assumed to be relative to the rules file's directory.
+-- The source rule can specify a glob pattern.
+-- If the glob pattern matches multiple files, the newest (last modified) file is used,
+-- unless the import command is running and archiving is enabled, in which case the oldest file is used.
+-- When the import command is running and archiving is enabled, after a successful read
+-- the data file is archived in an archive directory (data/ next to the rules file, auto-created).
+-- Balance assertions are not checked by this reader.
+--
 parse :: InputOpts -> FilePath -> Handle -> ExceptT String IO Journal
-parse iopts f h = do
-  lift $ hClose h -- We don't need it
-  rules <- readRulesFile $ dbg4 "reading rules file" f
+parse iopts rulesfile h = do
+  lift $ hClose h -- We don't need it (XXX why ?)
   -- XXX higher-than usual debug level for file reading to bypass excessive noise from elsewhere, normally 6 or 7
+  rules <- readRulesFile $ dbg4 "reading rules file" rulesfile
+
+  let
+    -- XXX How can we know when the command is import, and if it's a dry run ? In a hacky way, currently.
+    args       = progArgs
+    cmd        = headDef "" $ dropWhile ((=="-").take 1) args
+    importcmd  = cmd `elem` ["import", "imp"]
+    dryrun     = any (`elem` args) ["--dry-run", "--dry"]
+    importing  = importcmd && not dryrun
+    archiving  = importing && isJust (getDirective "import" rules)
+    rulesdir   = takeDirectory rulesfile
+    archivedir = rulesdir </> "data"
+
   mdatafile <- liftIO $ do
-    dldir <- getDownloadDir
-    let rulesdir = takeDirectory f
+    dldir <- getDownloadDir -- look here for the data file if it's specified without a directory
     let msource = T.unpack <$> getDirective "source" rules
-    fs <- case msource of
-      Just src -> expandGlob dir (dbg4 "source" src) >>= sortByModTime <&> dbg4 ("matched files"<>desc<>", newest first")
-        where (dir,desc) = if isFileName src then (dldir," in download directory") else (rulesdir,"")
-      Nothing -> return [maybe err (dbg4 "inferred source") $ dataFileFor f] -- shouldn't fail, f has .rules extension
-        where err = error' $ "could not infer a data file for " <> f
-    return $ dbg4 "data file" $ headMay fs
+    -- WISH: when not importing, and the source rule matches no files, read the latest archived file
+    datafiles <- case msource of
+      Just glb -> expandGlob dir (dbg4 "source" glb) >>= sortByModTime <&> dbg4 ("matched files"<>desc<>", newest first")
+        where (dir,desc) = if isFileName glb then (dldir," in download directory") else (rulesdir,"")
+      Nothing -> return [maybe err (dbg4 "inferred source") $ dataFileFor rulesfile] -- shouldn't fail, f has .rules extension
+        where err = error' $ "could not infer a data file for " <> rulesfile
+    return $ case datafiles of
+      []                          -> Nothing
+      [f] | importcmd             -> dbg4 "importing" <$> Just f
+      [f]                         -> dbg4 "reading" <$> Just f
+      fs | importcmd && archiving -> dbg4 "importing oldest file" <$> headMay fs
+      fs | importcmd              -> dbg4 "importing newest file" <$> lastMay fs
+      fs                          -> dbg4 "reading newest file" <$> lastMay fs
+
   case mdatafile of
     Nothing -> return nulljournal -- data file specified by source rule was not found
-    Just dat -> do
-      exists <- liftIO $ doesFileExist dat
-      if not (dat=="-" || exists)
+    Just datafile -> do
+      exists <- liftIO $ doesFileExist datafile
+      if not (datafile=="-" || exists)
        then return nulljournal -- data file inferred from rules file name was not found
        else do
-          dath <- liftIO $ openFileOrStdin dat
-          readJournalFromCsv (Just $ Left rules) dat dath Nothing
+          datafileh <- liftIO $ openFileOrStdin datafile
+          readJournalFromCsv (Just $ Left rules) datafile datafileh Nothing
          -- apply any command line account aliases. Can fail with a bad replacement pattern.
          >>= liftEither . journalApplyAliases (aliasesFromOpts iopts)
          -- journalFinalise assumes the journal's items are
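To restate the selection logic added in this hunk: no match yields nothing, a single match is used directly, and with multiple matches an archiving import takes the oldest file while anything else takes the newest. A minimal standalone sketch of that rule, assuming the candidate list is already sorted oldest-first as `sortByModTime` now returns (`selectDataFile` is a hypothetical helper, not part of the commit):

```haskell
import Safe (headMay, lastMay)

-- Mirrors the `return $ case datafiles of ...` expression above.
-- The input list is assumed sorted oldest-first (as by sortByModTime).
selectDataFile :: Bool -> Bool -> [FilePath] -> Maybe FilePath
selectDataFile importcmd archiving datafiles = case datafiles of
  []                          -> Nothing
  [f]                         -> Just f
  fs | importcmd && archiving -> headMay fs  -- import with archive rule: oldest match
  fs                          -> lastMay fs  -- otherwise: newest match
```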
@@ -152,7 +176,24 @@ parse iopts f h = do
          -- But here they are already properly ordered. So we'd
          -- better preemptively reverse them once more. XXX inefficient
          . journalReverse
-          >>= journalFinalise iopts{balancingopts_=(balancingopts_ iopts){ignore_assertions_=True}} f ""
+          >>= journalFinalise iopts{balancingopts_=(balancingopts_ iopts){ignore_assertions_=True}} rulesfile ""
+          >>= \j -> do
+            when archiving $ liftIO $ archiveTo datafile archivedir
+            return j
+
+-- | Move a file to the given directory, creating the directory (and parents) if needed,
+-- showing informational output on stderr.
+archiveTo :: FilePath -> FilePath -> IO ()
+archiveTo datafile archivedir = do
+  createDirectoryIfMissing True archivedir
+  hPutStrLn stderr $ "archiving " <> datafile
+  datafilemodtime <- getModificationTime datafile
+  let
+    archivefilename = takeBaseName datafile <.> datafilemoddate <.> takeExtension datafile
+      where datafilemoddate = show $ utctDay datafilemodtime
+    archivefile = archivedir </> archivefilename
+  hPutStrLn stderr $ " as " <> archivefile
+  renameFile datafile archivefile

 --- ** reading rules files
 --- *** rules utilities
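The archive name built above combines the data file's base name, its modification date, and its original extension. A standalone sketch of that naming scheme under the same assumptions (`archiveNameFor` is a hypothetical helper, not part of the commit):

```haskell
import Data.Time (utctDay)
import System.Directory (getModificationTime)
import System.FilePath (takeBaseName, takeExtension, (<.>), (</>))

-- Compute the destination path that archiveTo above would move a data file to.
archiveNameFor :: FilePath -> FilePath -> IO FilePath
archiveNameFor archivedir datafile = do
  modtime <- getModificationTime datafile
  let moddate = show (utctDay modtime)  -- eg "2025-06-01"
  -- eg "bank (2).csv" last modified 2025-06-01 -> "data/bank (2).2025-06-01.csv"
  return $ archivedir </> (takeBaseName datafile <.> moddate <.> takeExtension datafile)
```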
@@ -392,10 +433,12 @@ Grammar for the CSV conversion rules, more or less:

 RULES: RULE*

-RULE: ( SOURCE | FIELD-LIST | FIELD-ASSIGNMENT | CONDITIONAL-BLOCK | SKIP | TIMEZONE | NEWEST-FIRST | INTRA-DAY-REVERSED | DATE-FORMAT | DECIMAL-MARK | COMMENT | BLANK ) NEWLINE
+RULE: ( SOURCE | ARCHIVE | FIELD-LIST | FIELD-ASSIGNMENT | CONDITIONAL-BLOCK | SKIP | TIMEZONE | NEWEST-FIRST | INTRA-DAY-REVERSED | DATE-FORMAT | DECIMAL-MARK | COMMENT | BLANK ) NEWLINE

 SOURCE: source SPACE FILEPATH

+ARCHIVE: archive
+
 FIELD-LIST: fields SPACE FIELD-NAME ( SPACE? , SPACE? FIELD-NAME )*

 FIELD-NAME: QUOTED-FIELD-NAME | BARE-FIELD-NAME
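For illustration, a small rules file using the SOURCE and ARCHIVE rules from this grammar might look like the following (hypothetical file names and fields, not from the commit):

```rules
# Checking1.csv.rules (example)
source   Checking1*.csv
archive

fields   date, description, amount
date-format  %Y-%m-%d
```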
@@ -518,6 +561,7 @@ directivep = (do
 directives :: [Text]
 directives =
   ["source"
+  ,"archive"
   ,"encoding"
   ,"date-format"
   ,"decimal-mark"
@@ -131,7 +131,6 @@ import Data.FileEmbed (makeRelativeToProject, embedStringFile)
 import Data.Functor ((<&>))
 import Data.List hiding (uncons)
 import Data.Maybe (isJust, catMaybes)
-import Data.Ord (comparing, Down (Down))
 import qualified Data.Text as T
 import Data.Text.Encoding.Error (UnicodeException)
 import qualified Data.Text.IO as T
@@ -415,11 +414,11 @@ expandPath curdir p = (if isRelative p then (curdir </>) else id) <$> expandHome
 expandGlob :: FilePath -> FilePath -> IO [FilePath]
 expandGlob curdir p = expandPath curdir p >>= glob <&> sort -- PARTIAL:

--- | Given a list of existing file paths, sort them by modification time, most recent first.
+-- | Given a list of existing file paths, sort them by modification time (from oldest to newest).
 sortByModTime :: [FilePath] -> IO [FilePath]
 sortByModTime fs = do
   ftimes <- forM fs $ \f -> do {t <- getModificationTime f; return (t,f)}
-  return $ map snd $ sortBy (comparing Data.Ord.Down) ftimes
+  return $ map snd $ sort ftimes

 -- | Like readFilePortably, but read all of the file before proceeding.
 readFileStrictly :: FilePath -> IO T.Text
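A side note on the `sort` change: sorting (modification time, path) pairs with plain `sort` yields oldest-first order, since tuples compare on their first component and `UTCTime` is ordered chronologically. A small standalone check (not part of the commit):

```haskell
import Data.List (sort)
import Data.Time (UTCTime(..), fromGregorian, secondsToDiffTime)

main :: IO ()
main = do
  let older = UTCTime (fromGregorian 2025 1 1) (secondsToDiffTime 0)
      newer = UTCTime (fromGregorian 2025 2 1) (secondsToDiffTime 0)
  -- prints ["old.csv","new.csv"]: oldest modification time first
  print $ map snd $ sort [(newer, "new.csv"), (older, "old.csv")]
```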
@@ -26,7 +26,7 @@ $ hledger import bank1-checking.csv bank1-savings.csv
 $ hledger import *.csv
 ```

-### Import preview
+### Import dry run

 It's useful to preview the import by running first with `--dry-run`,
 to sanity check the range of dates being imported,
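For example, a preview followed by the real import might look like this (illustrative file name):

```cli
$ hledger import --dry-run bank.csv
$ hledger import bank.csv
```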
@@ -160,28 +160,47 @@ as declared by [`commodity` directives](#commodity-directive) or inferred from t

 Related: [CSV > Amount decimal places](#amount-decimal-places).

+### Import archiving
+
+When importing from a CSV rules file (`hledger import bank.rules`),
+you can use the [archive rule](#archive) to enable automatic archiving of the data file.
+After a successful import, the data file (specified by `source`) will be moved
+to an archive folder (`data/`, next to the rules file, auto-created),
+and renamed similar to the rules file, with a date.
+This can be useful for troubleshooting, detecting variations in your banks' CSV data,
+regenerating entries with improved rules, etc.
+
+The `archive` rule also causes `import` to handle `source` glob patterns differently:
+when there are multiple matched files, it will pick the oldest, not the newest.
+
 ### Import special cases

-If you have a download whose file name varies, you could rename it to a fixed name after each download.
-Or you could use a [CSV `source` rule](#source) with a suitable glob pattern,
-and import [from the .rules file](#reading-files-specified-by-rule) instead of the data file.
-
-Here's a situation where you would need to run `import` with care:
-say you download `bank.csv`, but forget to import it or delete it.
-And next month you download it again. This time your web browser may save it as `bank (2).csv`.
-So now each of these may have data not included in the other.
-And a `source` rule with a glob pattern would match only the most recent file.
-So in this case you should import from each one in turn, in the correct order, taking care to use the same filename each time:
-
-```cli
-$ hledger import bank.csv
-$ mv 'bank (2).csv' bank.csv
-$ hledger import bank.csv
-```
+#### Deduplication

 Here are two kinds of "deduplication" which `import` does not handle
-(and generally should not, since these can happen legitimately in financial data):
+(and should not, because these can happen legitimately in financial data):

 - Two or more of the new CSV records are identical, and generate identical new journal entries.
 - A new CSV record generates a journal entry identical to one(s) already in the journal.

+#### Varying file name
+
+If you have a download whose file name varies, you could rename it to a fixed name after each download.
+Or you could use a [CSV `source` rule](#source) with a suitable glob pattern,
+and import [from the .rules file](#reading-files-specified-by-rule).
+
+#### Multiple versions
+
+Say you download `bank.csv`, import it, but forget to delete it from your downloads folder.
+The next time you download it, your web browser will save it as (eg) `bank (2).csv`.
+The [source rule](#source)'s glob patterns are for just this situation:
+instead of specifying `source bank.csv`, specify `source bank*.csv`.
+Then `hledger -f bank.rules CMD` or `hledger import bank.rules`
+will automatically pick the newest matched file (`bank (2).csv`).
+
+Alternately, what if you download, but forget to import or delete, then download again ?
+Now each of `bank.csv` and `bank (2).csv` might contain data that's not in the other, and not in your journal.
+In this case, it's best to import each of them in turn, oldest first
+(otherwise, overlap detection could cause new records to be skipped).
+Enabling [import archiving](#import-archiving) ensures this.
+Then `hledger import bank.rules; hledger import bank.rules` will import and archive first `bank.csv`, then `bank (2).csv`.
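To make the "Multiple versions" workflow above concrete, here is an illustrative rules file and import sequence (example names, not part of the commit):

```rules
# bank.rules (example)
source  bank*.csv
archive
```

```cli
$ hledger import bank.rules    # imports and archives the oldest match, eg bank.csv
$ hledger import bank.rules    # then the next one, eg "bank (2).csv"
```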
@@ -3280,15 +3280,33 @@ in your system's downloads directory (`~/Downloads`, currently):
 source Checking1.csv
 ```

-And if you specify a glob pattern, hledger will read the most recent of the matched files
-(useful with repeated downloads):
+And if you specify a glob pattern, hledger will read the newest (most recently modified) of the matched files,
+which is useful eg if your browser has saved multiple versions of a download:

 ```rules
 source Checking1*.csv
 ```

+This enables a convenient workflow where you just download CSV files to the default place, then run `hledger import rules/*`.
+Once they have been imported, you can discard them or ignore them.
+
 See also ["Working with CSV > Reading files specified by rule"](#reading-files-specified-by-rule).

+## `archive`
+
+The `archive` rule can be used together with `source` to make importing a little more convenient.
+It affects only the [import](#import) command. When enabled,
+
+- `import` will process multiple `source` glob matches oldest first.
+  So if you have multiple versions of a download, repeated imports will process them in chronological order.
+
+- After successfully importing a `source`-specified file,
+  `import` will move it to an archive directory (`data/` next to the rules file, auto-created),
+  and rename it to `RULESFILENAME.MODIFICATIONDATE.DOWNLOADEXT`.
+
+Archiving imported files in this way is completely optional, but it can be useful for troubleshooting,
+detecting variations in your banks' CSV data, regenerating entries with improved rules, etc.
+
 ## `encoding`

 ```rules