feat:import:archive: archive data files, and process oldest first
parent db7783b16d
commit 76dc6d089a
@@ -22,6 +22,7 @@ Most of the code for reading rules files and csv files is in this module.
 {-# LANGUAGE ViewPatterns #-}
 {-# OPTIONS_GHC -Wno-unrecognised-pragmas #-}
 {-# LANGUAGE LambdaCase #-}
+{-# LANGUAGE TupleSections #-}
 
 --- ** exports
 module Hledger.Read.RulesReader (
@@ -67,10 +68,10 @@ import qualified Data.Text as T
 import qualified Data.Text.Encoding as T
 import qualified Data.Text.IO as T
 import Data.Time ( Day, TimeZone, UTCTime, LocalTime, ZonedTime(ZonedTime),
-  defaultTimeLocale, getCurrentTimeZone, localDay, parseTimeM, utcToLocalTime, localTimeToUTC, zonedTimeToUTC)
-import Safe (atMay, headMay, lastMay, readMay)
-import System.FilePath ((</>), takeDirectory, takeExtension, stripExtension, takeFileName)
-import System.IO (Handle, hClose)
+  defaultTimeLocale, getCurrentTimeZone, localDay, parseTimeM, utcToLocalTime, localTimeToUTC, zonedTimeToUTC, utctDay)
+import Safe (atMay, headMay, lastMay, readMay, headDef)
+import System.FilePath ((</>), takeDirectory, takeExtension, stripExtension, takeFileName, takeBaseName, (<.>))
+import System.IO (Handle, hClose, hPutStrLn, stderr)
 import qualified Data.Csv as Cassava
 import qualified Data.Csv.Parser.Megaparsec as CassavaMegaparsec
 import qualified Data.ByteString as B
@@ -84,7 +85,7 @@ import Hledger.Data
 import Hledger.Utils
 import Hledger.Read.Common (aliasesFromOpts, Reader(..), InputOpts(..), amountp, statusp, journalFinalise, accountnamep, transactioncommentp, postingcommentp )
 import Hledger.Write.Csv
-import System.Directory (doesFileExist, getHomeDirectory)
+import System.Directory (doesFileExist, getHomeDirectory, renameFile, getModificationTime, createDirectoryIfMissing)
 import Data.Either (fromRight)
 import Control.DeepSeq (deepseq)
 
@@ -110,41 +111,64 @@ getDownloadDir = do
   home <- getHomeDirectory
   return $ home </> "Downloads" -- XXX
 
--- | Parse and post-process a "Journal" from the given rules file path, or give an error.
--- A data file is inferred from the @source@ rule, otherwise from a similarly-named file
--- in the same directory.
--- The source rule can specify a glob pattern and supports ~ for home directory.
--- If it is a bare filename it will be relative to the defaut download directory
--- on this system. If is a relative file path it will be relative to the rules
--- file's directory. When a glob pattern matches multiple files, the alphabetically
--- last is used. (Eg in case of multiple numbered downloads, the highest-numbered
--- will be used.)
+-- | Read, parse and post-process a "Journal" from the given rules file, or give an error.
+--
+-- The provided handle and any --rules option are ignored by this reader.
+-- Balance assertions are not checked.
+-- A data file is inferred from the @source@ rule, otherwise from a similarly-named file in the same directory.
+-- The source rule supports ~ for home directory.
+-- If it is a bare filename, its directory is assumed to be ~/Downloads.
+-- If it is a relative file path, it is assumed to be relative to the rules file's directory.
+-- The source rule can specify a glob pattern.
+-- If the glob pattern matches multiple files, the newest (last modified) file is used,
+-- unless the import command is running and archiving is enabled, in which case the oldest file is used.
+-- When the import command is running and archiving is enabled, after a successful read
+-- the data file is archived in an archive directory (data/ next to the rules file, auto-created).
+--
 parse :: InputOpts -> FilePath -> Handle -> ExceptT String IO Journal
-parse iopts f h = do
-  lift $ hClose h -- We don't need it
-  rules <- readRulesFile $ dbg4 "reading rules file" f
+parse iopts rulesfile h = do
+  lift $ hClose h -- We don't need it (XXX why ?)
+
+  -- XXX higher-than-usual debug level for file reading, to bypass excessive noise from elsewhere; normally 6 or 7
+  rules <- readRulesFile $ dbg4 "reading rules file" rulesfile
+
+  let
+    -- XXX How can we know when the command is import, and if it's a dry run ? In a hacky way, currently.
+    args       = progArgs
+    cmd        = headDef "" $ dropWhile ((=="-").take 1) args
+    importcmd  = cmd `elem` ["import", "imp"]
+    dryrun     = any (`elem` args) ["--dry-run", "--dry"]
+    importing  = importcmd && not dryrun
+    archiving  = importing && isJust (getDirective "archive" rules)
+    rulesdir   = takeDirectory rulesfile
+    archivedir = rulesdir </> "data"
+
   mdatafile <- liftIO $ do
-    dldir <- getDownloadDir
-    let rulesdir = takeDirectory f
+    dldir <- getDownloadDir -- look here for the data file if it's specified without a directory
     let msource = T.unpack <$> getDirective "source" rules
-    fs <- case msource of
-      Just src -> expandGlob dir (dbg4 "source" src) >>= sortByModTime <&> dbg4 ("matched files"<>desc<>", newest first")
-        where (dir,desc) = if isFileName src then (dldir," in download directory") else (rulesdir,"")
-      Nothing -> return [maybe err (dbg4 "inferred source") $ dataFileFor f] -- shouldn't fail, f has .rules extension
-        where err = error' $ "could not infer a data file for " <> f
-    return $ dbg4 "data file" $ headMay fs
+    -- WISH: when not importing, and the source rule matches no files, read the latest archived file
+    datafiles <- case msource of
+      Just glb -> expandGlob dir (dbg4 "source" glb) >>= sortByModTime <&> dbg4 ("matched files"<>desc<>", oldest first")
+        where (dir,desc) = if isFileName glb then (dldir," in download directory") else (rulesdir,"")
+      Nothing -> return [maybe err (dbg4 "inferred source") $ dataFileFor rulesfile] -- shouldn't fail, rulesfile has a .rules extension
+        where err = error' $ "could not infer a data file for " <> rulesfile
+    return $ case datafiles of
+      []                          -> Nothing
+      [f] | importcmd             -> dbg4 "importing" <$> Just f
+      [f]                         -> dbg4 "reading" <$> Just f
+      fs | importcmd && archiving -> dbg4 "importing oldest file" <$> headMay fs
+      fs | importcmd              -> dbg4 "importing newest file" <$> lastMay fs
+      fs                          -> dbg4 "reading newest file" <$> lastMay fs
 
   case mdatafile of
     Nothing -> return nulljournal -- data file specified by source rule was not found
-    Just dat -> do
-      exists <- liftIO $ doesFileExist dat
-      if not (dat=="-" || exists)
+    Just datafile -> do
+      exists <- liftIO $ doesFileExist datafile
+      if not (datafile=="-" || exists)
        then return nulljournal -- data file inferred from rules file name was not found
        else do
-         dath <- liftIO $ openFileOrStdin dat
-         readJournalFromCsv (Just $ Left rules) dat dath Nothing
+         datafileh <- liftIO $ openFileOrStdin datafile
+         readJournalFromCsv (Just $ Left rules) datafile datafileh Nothing
          -- apply any command line account aliases. Can fail with a bad replacement pattern.
          >>= liftEither . journalApplyAliases (aliasesFromOpts iopts)
          -- journalFinalise assumes the journal's items are
@@ -152,7 +176,24 @@ parse iopts f h = do
          -- But here they are already properly ordered. So we'd
          -- better preemptively reverse them once more. XXX inefficient
          . journalReverse
-         >>= journalFinalise iopts{balancingopts_=(balancingopts_ iopts){ignore_assertions_=True}} f ""
+         >>= journalFinalise iopts{balancingopts_=(balancingopts_ iopts){ignore_assertions_=True}} rulesfile ""
+         >>= \j -> do
+           when archiving $ liftIO $ archiveTo datafile archivedir
+           return j
+
+-- | Move a file to the given directory, creating the directory (and parents) if needed,
+-- showing informational output on stderr.
+archiveTo :: FilePath -> FilePath -> IO ()
+archiveTo datafile archivedir = do
+  createDirectoryIfMissing True archivedir
+  hPutStrLn stderr $ "archiving " <> datafile
+  datafilemodtime <- getModificationTime datafile
+  let
+    archivefilename = takeBaseName datafile <.> datafilemoddate <.> takeExtension datafile
+      where datafilemoddate = show $ utctDay datafilemodtime
+    archivefile = archivedir </> archivefilename
+  hPutStrLn stderr $ " as " <> archivefile
+  renameFile datafile archivefile
 
 --- ** reading rules files
 --- *** rules utilities
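For reference, the archive file name built above is the data file's own base name with its modification date inserted before the extension. A standalone sketch of that computation, not part of the diff (the helper name, file name and date are hypothetical):

```haskell
import System.FilePath (takeBaseName, takeExtension, (<.>), (</>))

-- The path archiveTo moves a data file to, e.g.:
-- archiveName "data" "2025-06-01" "bank (2).csv" == "data/bank (2).2025-06-01.csv"
archiveName :: FilePath -> String -> FilePath -> FilePath
archiveName archivedir moddate datafile =
  archivedir </> (takeBaseName datafile <.> moddate <.> takeExtension datafile)
```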
@@ -392,10 +433,12 @@ Grammar for the CSV conversion rules, more or less:
 
 RULES: RULE*
 
-RULE: ( SOURCE | FIELD-LIST | FIELD-ASSIGNMENT | CONDITIONAL-BLOCK | SKIP | TIMEZONE | NEWEST-FIRST | INTRA-DAY-REVERSED | DATE-FORMAT | DECIMAL-MARK | COMMENT | BLANK ) NEWLINE
+RULE: ( SOURCE | ARCHIVE | FIELD-LIST | FIELD-ASSIGNMENT | CONDITIONAL-BLOCK | SKIP | TIMEZONE | NEWEST-FIRST | INTRA-DAY-REVERSED | DATE-FORMAT | DECIMAL-MARK | COMMENT | BLANK ) NEWLINE
 
 SOURCE: source SPACE FILEPATH
 
+ARCHIVE: archive
+
 FIELD-LIST: fields SPACE FIELD-NAME ( SPACE? , SPACE? FIELD-NAME )*
 
 FIELD-NAME: QUOTED-FIELD-NAME | BARE-FIELD-NAME
@@ -518,6 +561,7 @@ directivep = (do
 directives :: [Text]
 directives =
   ["source"
+  ,"archive"
   ,"encoding"
   ,"date-format"
   ,"decimal-mark"
@@ -131,7 +131,6 @@ import Data.FileEmbed (makeRelativeToProject, embedStringFile)
 import Data.Functor ((<&>))
 import Data.List hiding (uncons)
 import Data.Maybe (isJust, catMaybes)
-import Data.Ord (comparing, Down (Down))
 import qualified Data.Text as T
 import Data.Text.Encoding.Error (UnicodeException)
 import qualified Data.Text.IO as T
@@ -415,11 +414,11 @@ expandPath curdir p = (if isRelative p then (curdir </>) else id) <$> expandHome
 expandGlob :: FilePath -> FilePath -> IO [FilePath]
 expandGlob curdir p = expandPath curdir p >>= glob <&> sort -- PARTIAL:
 
--- | Given a list of existing file paths, sort them by modification time, most recent first.
+-- | Given a list of existing file paths, sort them by modification time (from oldest to newest).
 sortByModTime :: [FilePath] -> IO [FilePath]
 sortByModTime fs = do
   ftimes <- forM fs $ \f -> do {t <- getModificationTime f; return (t,f)}
-  return $ map snd $ sortBy (comparing Data.Ord.Down) ftimes
+  return $ map snd $ sort ftimes
 
 -- | Like readFilePortably, but read all of the file before proceeding.
 readFileStrictly :: FilePath -> IO T.Text
@@ -26,7 +26,7 @@ $ hledger import bank1-checking.csv bank1-savings.csv
 $ hledger import *.csv
 ```
 
-### Import preview
+### Import dry run
 
 It's useful to preview the import by running first with `--dry-run`,
 to sanity check the range of dates being imported,
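The workflow being described, sketched as commands, not part of the diff (file name hypothetical):

```cli
$ hledger import --dry-run bank.csv   # preview which entries would be imported
$ hledger import bank.csv             # import them for real
```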
@@ -160,28 +160,47 @@ as declared by [`commodity` directives](#commodity-directive) or inferred from t
 
 Related: [CSV > Amount decimal places](#amount-decimal-places).
 
+### Import archiving
+
+When importing from a CSV rules file (`hledger import bank.rules`),
+you can use the [archive rule](#archive) to enable automatic archiving of the data file.
+After a successful import, the data file (specified by `source`) will be moved
+to an archive folder (`data/`, next to the rules file, auto-created),
+and renamed with its modification date added before the file extension.
+This can be useful for troubleshooting, detecting variations in your banks' CSV data,
+regenerating entries with improved rules, etc.
+
+The `archive` rule also causes `import` to handle `source` glob patterns differently:
+when there are multiple matched files, it will pick the oldest, not the newest.
+
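A minimal rules file enabling this behaviour, for illustration and not part of the diff (file names hypothetical):

```rules
# bank.rules
source bank*.csv
archive
```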
 ### Import special cases
 
-If you have a download whose file name varies, you could rename it to a fixed name after each download.
-Or you could use a [CSV `source` rule](#source) with a suitable glob pattern,
-and import [from the .rules file](#reading-files-specified-by-rule) instead of the data file.
-
-Here's a situation where you would need to run `import` with care:
-say you download `bank.csv`, but forget to import it or delete it.
-And next month you download it again. This time your web browser may save it as `bank (2).csv`.
-So now each of these may have data not included in the other.
-And a `source` rule with a glob pattern would match only the most recent file.
-So in this case you should import from each one in turn, in the correct order, taking care to use the same filename each time:
-
-```cli
-$ hledger import bank.csv
-$ mv 'bank (2).csv' bank.csv
-$ hledger import bank.csv
-```
 #### Deduplication
 
 Here are two kinds of "deduplication" which `import` does not handle
-(and generally should not, since these can happen legitimately in financial data):
+(and should not, because these can happen legitimately in financial data):
 
 - Two or more of the new CSV records are identical, and generate identical new journal entries.
 - A new CSV record generates a journal entry identical to one(s) already in the journal.
 
+#### Varying file name
+
+If you have a download whose file name varies, you could rename it to a fixed name after each download.
+Or you could use a [CSV `source` rule](#source) with a suitable glob pattern,
+and import [from the .rules file](#reading-files-specified-by-rule).
+
+#### Multiple versions
+
+Say you download `bank.csv`, import it, but forget to delete it from your downloads folder.
+The next time you download it, your web browser will save it as (eg) `bank (2).csv`.
+The [source rule](#source)'s glob patterns are for just this situation:
+instead of specifying `source bank.csv`, specify `source bank*.csv`.
+Then `hledger -f bank.rules CMD` or `hledger import bank.rules`
+will automatically pick the newest matched file (`bank (2).csv`).
+
+Alternately, what if you download, but forget to import or delete, then download again ?
+Now each of `bank.csv` and `bank (2).csv` might contain data that's not in the other, and not in your journal.
+In this case, it's best to import each of them in turn, oldest first
+(otherwise, overlap detection could cause new records to be skipped).
+Enabling [import archiving](#import-archiving) ensures this.
+Then `hledger import bank.rules; hledger import bank.rules` will import and archive first `bank.csv`, then `bank (2).csv`.
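That last sequence, sketched as commands and not part of the diff (file names hypothetical):

```cli
$ hledger import bank.rules   # imports the oldest match, bank.csv, then archives it
$ hledger import bank.rules   # imports the remaining bank (2).csv, then archives it
```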
@@ -3280,15 +3280,33 @@ in your system's downloads directory (`~/Downloads`, currently):
 source Checking1.csv
 ```
 
-And if you specify a glob pattern, hledger will read the most recent of the matched files
-(useful with repeated downloads):
+And if you specify a glob pattern, hledger will read the newest (most recently modified) of the matched files,
+which is useful eg if your browser has saved multiple versions of a download:
 
 ```rules
 source Checking1*.csv
 ```
 
 This enables a convenient workflow where you just download CSV files to the default place, then run `hledger import rules/*`.
 Once they have been imported, you can discard them or ignore them.
 
 See also ["Working with CSV > Reading files specified by rule"](#reading-files-specified-by-rule).
 
+## `archive`
+
+The `archive` rule can be used together with `source` to make importing a little more convenient.
+It affects only the [import](#import) command. When enabled,
+
+- `import` will process multiple `source` glob matches oldest first.
+  So if you have multiple versions of a download, repeated imports will process them in chronological order.
+
+- After successfully importing a `source`-specified file,
+  `import` will move it to an archive directory (`data/` next to the rules file, auto-created),
+  and rename it, adding its modification date before the file extension (`NAME.YYYY-MM-DD.EXT`).
+
+Archiving imported files in this way is completely optional, but it can be useful for troubleshooting,
+detecting variations in your banks' CSV data, regenerating entries with improved rules, etc.
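For example, with a `bank.rules` containing (an illustration, not part of the diff; names and date hypothetical):

```rules
source bank*.csv
archive
```

a successful `hledger import bank.rules` of a `bank (2).csv` last modified on 2025-06-01 would move it to `data/bank (2).2025-06-01.csv`, beside the rules file.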
 ## `encoding`
 
 ```rules