feat:import:archive: archive data files, and process oldest first
parent db7783b16d
commit 76dc6d089a
@@ -22,6 +22,7 @@ Most of the code for reading rules files and csv files is in this module.
 {-# LANGUAGE ViewPatterns #-}
 {-# OPTIONS_GHC -Wno-unrecognised-pragmas #-}
 {-# LANGUAGE LambdaCase #-}
+{-# LANGUAGE TupleSections #-}

 --- ** exports
 module Hledger.Read.RulesReader (
@@ -67,10 +68,10 @@ import qualified Data.Text as T
 import qualified Data.Text.Encoding as T
 import qualified Data.Text.IO as T
 import Data.Time ( Day, TimeZone, UTCTime, LocalTime, ZonedTime(ZonedTime),
-  defaultTimeLocale, getCurrentTimeZone, localDay, parseTimeM, utcToLocalTime, localTimeToUTC, zonedTimeToUTC)
-import Safe (atMay, headMay, lastMay, readMay)
-import System.FilePath ((</>), takeDirectory, takeExtension, stripExtension, takeFileName)
-import System.IO (Handle, hClose)
+  defaultTimeLocale, getCurrentTimeZone, localDay, parseTimeM, utcToLocalTime, localTimeToUTC, zonedTimeToUTC, utctDay)
+import Safe (atMay, headMay, lastMay, readMay, headDef)
+import System.FilePath ((</>), takeDirectory, takeExtension, stripExtension, takeFileName, takeBaseName, (<.>))
+import System.IO (Handle, hClose, hPutStrLn, stderr)
 import qualified Data.Csv as Cassava
 import qualified Data.Csv.Parser.Megaparsec as CassavaMegaparsec
 import qualified Data.ByteString as B
@@ -84,7 +85,7 @@ import Hledger.Data
 import Hledger.Utils
 import Hledger.Read.Common (aliasesFromOpts, Reader(..), InputOpts(..), amountp, statusp, journalFinalise, accountnamep, transactioncommentp, postingcommentp )
 import Hledger.Write.Csv
-import System.Directory (doesFileExist, getHomeDirectory)
+import System.Directory (doesFileExist, getHomeDirectory, renameFile, getModificationTime, createDirectoryIfMissing)
 import Data.Either (fromRight)
 import Control.DeepSeq (deepseq)

@@ -110,41 +111,64 @@ getDownloadDir = do
   home <- getHomeDirectory
   return $ home </> "Downloads" -- XXX

--- | Parse and post-process a "Journal" from the given rules file path, or give an error.
--- A data file is inferred from the @source@ rule, otherwise from a similarly-named file
--- in the same directory.
--- The source rule can specify a glob pattern and supports ~ for home directory.
--- If it is a bare filename it will be relative to the defaut download directory
--- on this system. If is a relative file path it will be relative to the rules
--- file's directory. When a glob pattern matches multiple files, the alphabetically
--- last is used. (Eg in case of multiple numbered downloads, the highest-numbered
--- will be used.)
+-- | Read, parse and post-process a "Journal" from the given rules file, or give an error.
+--
 -- The provided handle, or a --rules option, are ignored by this reader.
--- Balance assertions are not checked.
+-- A data file is inferred from the @source@ rule, otherwise from a similarly-named file in the same directory.
+-- The source rule supports ~ for home directory.
+-- If it is a bare filename, its directory is assumed to be ~/Downloads.
+-- If it is a relative file path, it is assumed to be relative to the rules file's directory.
+-- The source rule can specify a glob pattern.
+-- If the glob pattern matches multiple files, the newest (last modified) file is used,
+-- unless the import command is running and archiving is enabled, in which case the oldest file is used.
+-- When the import command is running and archiving is enabled, after a successful read
+-- the data file is archived in an archive directory (data/ next to the rules file, auto-created).
+-- Balance assertions are not checked by this reader.
+--
 parse :: InputOpts -> FilePath -> Handle -> ExceptT String IO Journal
-parse iopts f h = do
-  lift $ hClose h -- We don't need it
-  rules <- readRulesFile $ dbg4 "reading rules file" f
+parse iopts rulesfile h = do
+  lift $ hClose h -- We don't need it (XXX why ?)
   -- XXX higher-than usual debug level for file reading to bypass excessive noise from elsewhere, normally 6 or 7
+  rules <- readRulesFile $ dbg4 "reading rules file" rulesfile
+
+  let
+    -- XXX How can we know when the command is import, and if it's a dry run ? In a hacky way, currently.
+    args       = progArgs
+    cmd        = headDef "" $ dropWhile ((=="-").take 1) args
+    importcmd  = cmd `elem` ["import", "imp"]
+    dryrun     = any (`elem` args) ["--dry-run", "--dry"]
+    importing  = importcmd && not dryrun
+    archiving  = importing && isJust (getDirective "import" rules)
+    rulesdir   = takeDirectory rulesfile
+    archivedir = rulesdir </> "data"
+
   mdatafile <- liftIO $ do
-    dldir <- getDownloadDir
-    let rulesdir = takeDirectory f
+    dldir <- getDownloadDir -- look here for the data file if it's specified without a directory
     let msource = T.unpack <$> getDirective "source" rules
-    fs <- case msource of
-      Just src -> expandGlob dir (dbg4 "source" src) >>= sortByModTime <&> dbg4 ("matched files"<>desc<>", newest first")
-        where (dir,desc) = if isFileName src then (dldir," in download directory") else (rulesdir,"")
-      Nothing -> return [maybe err (dbg4 "inferred source") $ dataFileFor f] -- shouldn't fail, f has .rules extension
-        where err = error' $ "could not infer a data file for " <> f
-    return $ dbg4 "data file" $ headMay fs
+    -- WISH: when not importing, and the source rule matches no files, read the latest archived file
+    datafiles <- case msource of
+      Just glb -> expandGlob dir (dbg4 "source" glb) >>= sortByModTime <&> dbg4 ("matched files"<>desc<>", newest first")
+        where (dir,desc) = if isFileName glb then (dldir," in download directory") else (rulesdir,"")
+      Nothing -> return [maybe err (dbg4 "inferred source") $ dataFileFor rulesfile] -- shouldn't fail, f has .rules extension
+        where err = error' $ "could not infer a data file for " <> rulesfile
+    return $ case datafiles of
+      []                          -> Nothing
+      [f] | importcmd             -> dbg4 "importing" <$> Just f
+      [f]                         -> dbg4 "reading" <$> Just f
+      fs | importcmd && archiving -> dbg4 "importing oldest file" <$> headMay fs
+      fs | importcmd              -> dbg4 "importing newest file" <$> lastMay fs
+      fs                          -> dbg4 "reading newest file" <$> lastMay fs
+
   case mdatafile of
     Nothing -> return nulljournal -- data file specified by source rule was not found
-    Just dat -> do
-      exists <- liftIO $ doesFileExist dat
-      if not (dat=="-" || exists)
+    Just datafile -> do
+      exists <- liftIO $ doesFileExist datafile
+      if not (datafile=="-" || exists)
        then return nulljournal -- data file inferred from rules file name was not found
        else do
-          dath <- liftIO $ openFileOrStdin dat
-          readJournalFromCsv (Just $ Left rules) dat dath Nothing
+          datafileh <- liftIO $ openFileOrStdin datafile
+          readJournalFromCsv (Just $ Left rules) datafile datafileh Nothing
          -- apply any command line account aliases. Can fail with a bad replacement pattern.
          >>= liftEither . journalApplyAliases (aliasesFromOpts iopts)
          -- journalFinalise assumes the journal's items are
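To restate the selection logic added in this hunk: no match yields nothing, a single match is used directly, and with multiple matches an archiving import takes the oldest file while anything else takes the newest. A minimal standalone sketch of that rule, assuming the candidate list is already sorted oldest-first as `sortByModTime` now returns (`selectDataFile` is a hypothetical helper, not part of the commit):

```haskell
import Safe (headMay, lastMay)

-- Mirrors the `return $ case datafiles of ...` expression above.
-- The input list is assumed sorted oldest-first (as by sortByModTime).
selectDataFile :: Bool -> Bool -> [FilePath] -> Maybe FilePath
selectDataFile importcmd archiving datafiles = case datafiles of
  []                          -> Nothing
  [f]                         -> Just f
  fs | importcmd && archiving -> headMay fs  -- import with archive rule: oldest match
  fs                          -> lastMay fs  -- otherwise: newest match
```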
@@ -152,7 +176,24 @@ parse iopts f h = do
          -- But here they are already properly ordered. So we'd
          -- better preemptively reverse them once more. XXX inefficient
          . journalReverse
-          >>= journalFinalise iopts{balancingopts_=(balancingopts_ iopts){ignore_assertions_=True}} f ""
+          >>= journalFinalise iopts{balancingopts_=(balancingopts_ iopts){ignore_assertions_=True}} rulesfile ""
+          >>= \j -> do
+            when archiving $ liftIO $ archiveTo datafile archivedir
+            return j
+
+-- | Move a file to the given directory, creating the directory (and parents) if needed,
+-- showing informational output on stderr.
+archiveTo :: FilePath -> FilePath -> IO ()
+archiveTo datafile archivedir = do
+  createDirectoryIfMissing True archivedir
+  hPutStrLn stderr $ "archiving " <> datafile
+  datafilemodtime <- getModificationTime datafile
+  let
+    archivefilename = takeBaseName datafile <.> datafilemoddate <.> takeExtension datafile
+      where datafilemoddate = show $ utctDay datafilemodtime
+    archivefile = archivedir </> archivefilename
+  hPutStrLn stderr $ " as " <> archivefile
+  renameFile datafile archivefile

 --- ** reading rules files
 --- *** rules utilities
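The archive name built above combines the data file's base name, its modification date, and its original extension. A standalone sketch of that naming scheme under the same assumptions (`archiveNameFor` is a hypothetical helper, not part of the commit):

```haskell
import Data.Time (utctDay)
import System.Directory (getModificationTime)
import System.FilePath (takeBaseName, takeExtension, (<.>), (</>))

-- Compute the destination path that archiveTo above would move a data file to.
archiveNameFor :: FilePath -> FilePath -> IO FilePath
archiveNameFor archivedir datafile = do
  modtime <- getModificationTime datafile
  let moddate = show (utctDay modtime)  -- eg "2025-06-01"
  -- eg "bank (2).csv" last modified 2025-06-01 -> "data/bank (2).2025-06-01.csv"
  return $ archivedir </> (takeBaseName datafile <.> moddate <.> takeExtension datafile)
```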
@@ -392,10 +433,12 @@ Grammar for the CSV conversion rules, more or less:

 RULES: RULE*

-RULE: ( SOURCE | FIELD-LIST | FIELD-ASSIGNMENT | CONDITIONAL-BLOCK | SKIP | TIMEZONE | NEWEST-FIRST | INTRA-DAY-REVERSED | DATE-FORMAT | DECIMAL-MARK | COMMENT | BLANK ) NEWLINE
+RULE: ( SOURCE | ARCHIVE | FIELD-LIST | FIELD-ASSIGNMENT | CONDITIONAL-BLOCK | SKIP | TIMEZONE | NEWEST-FIRST | INTRA-DAY-REVERSED | DATE-FORMAT | DECIMAL-MARK | COMMENT | BLANK ) NEWLINE

 SOURCE: source SPACE FILEPATH

+ARCHIVE: archive
+
 FIELD-LIST: fields SPACE FIELD-NAME ( SPACE? , SPACE? FIELD-NAME )*

 FIELD-NAME: QUOTED-FIELD-NAME | BARE-FIELD-NAME
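For illustration, a small rules file using the SOURCE and ARCHIVE rules from this grammar might look like the following (hypothetical file names and fields, not from the commit):

```rules
# Checking1.csv.rules (example)
source   Checking1*.csv
archive

fields   date, description, amount
date-format  %Y-%m-%d
```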
@@ -518,6 +561,7 @@ directivep = (do
 directives :: [Text]
 directives =
   ["source"
+  ,"archive"
   ,"encoding"
   ,"date-format"
   ,"decimal-mark"
@@ -131,7 +131,6 @@ import Data.FileEmbed (makeRelativeToProject, embedStringFile)
 import Data.Functor ((<&>))
 import Data.List hiding (uncons)
 import Data.Maybe (isJust, catMaybes)
-import Data.Ord (comparing, Down (Down))
 import qualified Data.Text as T
 import Data.Text.Encoding.Error (UnicodeException)
 import qualified Data.Text.IO as T
@@ -415,11 +414,11 @@ expandPath curdir p = (if isRelative p then (curdir </>) else id) <$> expandHome
 expandGlob :: FilePath -> FilePath -> IO [FilePath]
 expandGlob curdir p = expandPath curdir p >>= glob <&> sort -- PARTIAL:

--- | Given a list of existing file paths, sort them by modification time, most recent first.
+-- | Given a list of existing file paths, sort them by modification time (from oldest to newest).
 sortByModTime :: [FilePath] -> IO [FilePath]
 sortByModTime fs = do
   ftimes <- forM fs $ \f -> do {t <- getModificationTime f; return (t,f)}
-  return $ map snd $ sortBy (comparing Data.Ord.Down) ftimes
+  return $ map snd $ sort ftimes

 -- | Like readFilePortably, but read all of the file before proceeding.
 readFileStrictly :: FilePath -> IO T.Text
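A side note on the `sort` change: sorting (modification time, path) pairs with plain `sort` yields oldest-first order, since tuples compare on their first component and `UTCTime` is ordered chronologically. A small standalone check (not part of the commit):

```haskell
import Data.List (sort)
import Data.Time (UTCTime(..), fromGregorian, secondsToDiffTime)

main :: IO ()
main = do
  let older = UTCTime (fromGregorian 2025 1 1) (secondsToDiffTime 0)
      newer = UTCTime (fromGregorian 2025 2 1) (secondsToDiffTime 0)
  -- prints ["old.csv","new.csv"]: oldest modification time first
  print $ map snd $ sort [(newer, "new.csv"), (older, "old.csv")]
```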
@@ -26,7 +26,7 @@ $ hledger import bank1-checking.csv bank1-savings.csv
 $ hledger import *.csv
 ```

-### Import preview
+### Import dry run

 It's useful to preview the import by running first with `--dry-run`,
 to sanity check the range of dates being imported,
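For example, a preview followed by the real import might look like this (illustrative file name):

```cli
$ hledger import --dry-run bank.csv
$ hledger import bank.csv
```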
@@ -160,28 +160,47 @@ as declared by [`commodity` directives](#commodity-directive) or inferred from t

 Related: [CSV > Amount decimal places](#amount-decimal-places).

+### Import archiving
+
+When importing from a CSV rules file (`hledger import bank.rules`),
+you can use the [archive rule](#archive) to enable automatic archiving of the data file.
+After a successful import, the data file (specified by `source`) will be moved
+to an archive folder (`data/`, next to the rules file, auto-created),
+and renamed similar to the rules file, with a date.
+This can be useful for troubleshooting, detecting variations in your banks' CSV data,
+regenerating entries with improved rules, etc.
+
+The `archive` rule also causes `import` to handle `source` glob patterns differently:
+when there are multiple matched files, it will pick the oldest, not the newest.
+
 ### Import special cases

-If you have a download whose file name varies, you could rename it to a fixed name after each download.
-Or you could use a [CSV `source` rule](#source) with a suitable glob pattern,
-and import [from the .rules file](#reading-files-specified-by-rule) instead of the data file.
-
-Here's a situation where you would need to run `import` with care:
-say you download `bank.csv`, but forget to import it or delete it.
-And next month you download it again. This time your web browser may save it as `bank (2).csv`.
-So now each of these may have data not included in the other.
-And a `source` rule with a glob pattern would match only the most recent file.
-So in this case you should import from each one in turn, in the correct order, taking care to use the same filename each time:
-
-```cli
-$ hledger import bank.csv
-$ mv 'bank (2).csv' bank.csv
-$ hledger import bank.csv
-```
+#### Deduplication

 Here are two kinds of "deduplication" which `import` does not handle
-(and generally should not, since these can happen legitimately in financial data):
+(and should not, because these can happen legitimately in financial data):

 - Two or more of the new CSV records are identical, and generate identical new journal entries.
 - A new CSV record generates a journal entry identical to one(s) already in the journal.

+#### Varying file name
+
+If you have a download whose file name varies, you could rename it to a fixed name after each download.
+Or you could use a [CSV `source` rule](#source) with a suitable glob pattern,
+and import [from the .rules file](#reading-files-specified-by-rule).
+
+#### Multiple versions
+
+Say you download `bank.csv`, import it, but forget to delete it from your downloads folder.
+The next time you download it, your web browser will save it as (eg) `bank (2).csv`.
+The [source rule](#source)'s glob patterns are for just this situation:
+instead of specifying `source bank.csv`, specify `source bank*.csv`.
+Then `hledger -f bank.rules CMD` or `hledger import bank.rules`
+will automatically pick the newest matched file (`bank (2).csv`).
+
+Alternately, what if you download, but forget to import or delete, then download again ?
+Now each of `bank.csv` and `bank (2).csv` might contain data that's not in the other, and not in your journal.
+In this case, it's best to import each of them in turn, oldest first
+(otherwise, overlap detection could cause new records to be skipped).
+Enabling [import archiving](#import-archiving) ensures this.
+Then `hledger import bank.rules; hledger import bank.rules` will import and archive first `bank.csv`, then `bank (2).csv`.
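To make the "Multiple versions" workflow above concrete, here is an illustrative rules file and import sequence (example names, not part of the commit):

```rules
# bank.rules (example)
source  bank*.csv
archive
```

```cli
$ hledger import bank.rules    # imports and archives the oldest match, eg bank.csv
$ hledger import bank.rules    # then the next one, eg "bank (2).csv"
```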
@@ -3280,15 +3280,33 @@ in your system's downloads directory (`~/Downloads`, currently):
 source Checking1.csv
 ```

-And if you specify a glob pattern, hledger will read the most recent of the matched files
-(useful with repeated downloads):
+And if you specify a glob pattern, hledger will read the newest (most recently modified) of the matched files,
+which is useful eg if your browser has saved multiple versions of a download:

 ```rules
 source Checking1*.csv
 ```

+This enables a convenient workflow where you just download CSV files to the default place, then run `hledger import rules/*`.
+Once they have been imported, you can discard them or ignore them.
+
 See also ["Working with CSV > Reading files specified by rule"](#reading-files-specified-by-rule).

+## `archive`
+
+The `archive` rule can be used together with `source` to make importing a little more convenient.
+It affects only the [import](#import) command. When enabled,
+
+- `import` will process multiple `source` glob matches oldest first.
+  So if you have multiple versions of a download, repeated imports will process them in chronological order.
+
+- After successfully importing a `source`-specified file,
+  `import` will move it to an archive directory (`data/` next to the rules file, auto-created),
+  and rename it to `RULESFILENAME.MODIFICATIONDATE.DOWNLOADEXT`.
+
+Archiving imported files in this way is completely optional, but it can be useful for troubleshooting,
+detecting variations in your banks' CSV data, regenerating entries with improved rules, etc.
+
 ## `encoding`

 ```rules