feat:csv: support data cleaning scripts

Simon Michael 2025-08-22 16:31:53 +01:00
parent c3e85ce9f7
commit c515fedf70
2 changed files with 144 additions and 66 deletions


@@ -45,6 +45,7 @@ where
--- ** imports
import Prelude hiding (Applicative(..))
import Control.Applicative (Applicative(..))
import Control.Concurrent (forkIO)
import Control.DeepSeq (deepseq)
import Control.Monad (unless, void, when)
import Control.Monad.Except (ExceptT(..), liftEither, throwError)
@@ -54,7 +55,11 @@ import Control.Monad.State.Strict (StateT, get, modify', evalStateT)
import Control.Monad.Trans.Class (lift)
import Data.Char (toLower, isDigit, isSpace, isAlphaNum, ord)
import Data.Bifunctor (first)
import Data.Encoding (encodingFromStringExplicit)
import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as BL
import qualified Data.Csv as Cassava
import qualified Data.Csv.Parser.Megaparsec as CassavaMegaparsec
import Data.Encoding (encodingFromStringExplicit)
import Data.Either (fromRight)
import Data.Functor ((<&>))
import Data.List (elemIndex, mapAccumL, nub, sortOn, isPrefixOf, sortBy)
@@ -74,12 +79,10 @@ import Data.Time ( Day, TimeZone, UTCTime, LocalTime, ZonedTime(ZonedTime),
defaultTimeLocale, getCurrentTimeZone, localDay, parseTimeM, utcToLocalTime, localTimeToUTC, zonedTimeToUTC, utctDay)
import Safe (atMay, headMay, lastMay, readMay, headDef)
import System.Directory (createDirectoryIfMissing, doesFileExist, getHomeDirectory, getModificationTime, listDirectory, renameFile, doesDirectoryExist)
import System.Exit (ExitCode(..))
import System.FilePath (stripExtension, takeBaseName, takeDirectory, takeExtension, takeFileName, (<.>), (</>))
import System.IO (Handle, hClose, hPutStrLn, stderr)
import qualified Data.Csv as Cassava
import qualified Data.Csv.Parser.Megaparsec as CassavaMegaparsec
import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as BL
import System.IO (Handle, hClose, hPutStrLn, stderr, hGetContents')
import System.Process (CreateProcess(..), StdStream(CreatePipe), shell, waitForProcess, withCreateProcess)
import Data.Foldable (asum, toList)
import Text.Megaparsec hiding (match, parse)
import Text.Megaparsec.Char (char, newline, string, digitChar)
@@ -113,17 +116,35 @@ getDownloadDir = do
return $ home </> "Downloads" -- XXX
-- | Read, parse and post-process a "Journal" from the given rules file, or give an error.
-- This particular reader also provides some extra features like data-cleaning and archiving.
--
-- The provided input file handle, and the --rules option, are ignored by this reader.
-- Instead, a data file (or data-generating command) is usually specified by the @source@ rule.
-- If there's no source rule, the data file is assumed to be named like the rules file without .rules, in the same directory.
--
-- The provided handle, or a --rules option, are ignored by this reader.
-- A data file is inferred from the @source@ rule, otherwise from a similarly-named file in the same directory.
-- The source rule supports ~ for home directory.
-- If it is a bare filename, its directory is assumed to be ~/Downloads.
-- If the argument is a bare filename, its directory is assumed to be ~/Downloads.
-- If it is a relative file path, it is assumed to be relative to the rules file's directory.
--
-- The source rule can specify a glob pattern.
-- If the glob pattern matches multiple files, the newest (last modified) file is used,
-- unless the import command is running and archiving is enabled, in which case the oldest file is used.
-- When the import command is running and archiving is enabled, after a successful read
-- the data file is archived in an archive directory (data/ next to the rules file, auto-created).
-- If the glob pattern matches multiple files, the newest (last modified) file is used (see also below).
--
-- The source rule can specify a data-cleaning command, after the file pattern and a | separator.
-- This command is executed by the user's default shell, receives the data file's content on stdin,
-- and should output data suitable for hledger to convert with CSV rules.
-- A # character can be used to comment out the data-cleaning command.
--
-- When using the source rule, if the archive rule is also present, some behaviours change:
--
-- - The import command:
-- will move the data file to an archive directory after a successful read
-- (renamed like the rules file, date-stamped, to an auto-created data/ directory next to the rules file).
-- And it will read the oldest data file, not the newest, if the glob pattern matches multiple files.
-- If there is a data-cleaning command, only the original uncleaned data is archived, currently.
--
-- - Other commands:
-- will read the newest archived data file, if any, as a fallback if the glob pattern matches no data files.
--
-- Balance assertions are not checked by this reader.
--
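-- For illustration, two hypothetical source rules of the kind described above
-- (the file and script names are invented, not from this codebase):
--
-- > source acme*.csv
-- > source acme.csv | python3 clean_acme.py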
parse :: InputOpts -> FilePath -> Handle -> ExceptT String IO Journal
@@ -137,17 +158,30 @@ parse iopts rulesfile h = do
-- XXX How can we know when the command is import, and if it's a dry run ? In a hacky way, currently.
args = progArgs
cmd = headDef "" $ dropWhile ((=="-").take 1) args
importcmd = dbg7 "importcmd" $ cmd `elem` ["import", "imp"]
cmdisimport = dbg7 "cmdisimport" $ cmd `elem` ["import", "imp"]
dryrun = dbg7 "dryrun" $ any (`elem` args) ["--dry-run", "--dry"]
importing = dbg7 "importing" $ importcmd && not dryrun
importing = dbg7 "importing" $ cmdisimport && not dryrun
archive = dbg7 "archive" $ isJust (getDirective "archive" rules)
archiving = dbg7 "archiving" $ importing && archive
rulesdir = dbg7 "rulesdir" $ takeDirectory rulesfile
archivedir = dbg7 "archivedir" $ rulesdir </> "data"
mdatafile <- liftIO $ do
mdatafileandcmd <- liftIO $ do
dldir <- getDownloadDir -- look here for the data file if it's specified without a directory
let msource = T.unpack <$> getDirective "source" rules
let
msourcearg = getDirective "source" rules
-- Surrounding whitespace is removed from the whole source argument and from each part of it.
-- A # before | makes the rest of the line a comment.
-- A # after | is left for the shell to interpret; it could be part of the command or the start of a comment.
stripspaces = T.strip
stripcommentandspaces = stripspaces . T.takeWhile (/= '#')
msourceandcmd = T.breakOn "|" . stripspaces <$> msourcearg
msource = T.unpack . stripcommentandspaces . fst <$> msourceandcmd
mcmd = msourceandcmd >>= \sc ->
let c = T.unpack . stripspaces . T.drop 1 . snd $ sc
in if null c then Nothing else Just c
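-- For example (hypothetical values): a source argument of "acme*.csv | ./clean.sh"
-- yields msource (Just "acme*.csv") and mcmd (Just "./clean.sh").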
datafiles <- case msource of
Nothing -> return [maybe err (dbg4 "inferred source") $ dataFileFor rulesfile] -- shouldn't fail, f has .rules extension
where err = error' $ "could not infer a data file for " <> rulesfile
@@ -156,55 +190,98 @@ parse iopts rulesfile h = do
globmatches <- expandGlob dir (dbg4 "source rule" glb) >>= sortByModTime <&> dbg4 ("matched files"<>desc<>", oldest first")
case globmatches of
-- if the source rule matched no files, and we are reading not importing, use the most recent archive file
[] | archive && not importcmd -> do
[] | archive && not cmdisimport -> do
archivesFor archivedir rulesfile <&> take 1 <&> dbg4 "latest file in archive directory"
_ -> return globmatches
_ -> return globmatches
return $ case datafiles of
[] -> Nothing
[f] | importcmd -> dbg4 "importing" <$> Just f
[f] -> dbg4 "reading" <$> Just f
fs | importcmd && archiving -> dbg4 "importing oldest file" <$> headMay fs
fs | importcmd -> dbg4 "importing newest file" <$> lastMay fs
fs -> dbg4 "reading newest file" <$> lastMay fs
[] -> (Nothing, Nothing)
[f] | cmdisimport -> dbg4 "importing" (Just f , mcmd)
[f] -> dbg4 "reading" (Just f , mcmd)
fs | cmdisimport && archiving -> dbg4 "importing oldest file" (headMay fs, mcmd)
fs | cmdisimport -> dbg4 "importing newest file" (lastMay fs, mcmd)
fs -> dbg4 "reading newest file" (lastMay fs, mcmd)
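-- For illustration (hypothetical case): if the glob matched two downloaded files,
-- an archiving import would read the first (oldest), while any other invocation
-- would read the last (newest).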
case mdatafile of
Nothing -> return nulljournal -- data file specified by source rule was not found
Just datafile -> do
case mdatafileandcmd of
(Nothing, _) -> return nulljournal -- data file specified by source rule was not found
(Just datafile, mcmd) -> do
exists <- liftIO $ doesFileExist datafile
if not (datafile=="-" || exists)
then return nulljournal -- data file inferred from rules file name was not found
else do
datafileh <- liftIO $ openFileOrStdin datafile
readJournalFromCsv (Just $ Left rules) datafile datafileh Nothing
-- apply any command line account aliases. Can fail with a bad replacement pattern.
>>= liftEither . journalApplyAliases (aliasesFromOpts iopts)
-- journalFinalise assumes the journal's items are
-- reversed, as produced by JournalReader's parser.
-- But here they are already properly ordered. So we'd
-- better preemptively reverse them once more. XXX inefficient
. journalReverse
>>= journalFinalise iopts{balancingopts_=(balancingopts_ iopts){ignore_assertions_=True}} rulesfile ""
>>= \j -> do
when archiving $ liftIO $ archiveTo rulesfile datafile archivedir
return j
datafileh <- liftIO $ openFileOrStdin datafile
rawdata <- liftIO $ readHandlePortably datafileh
cleandata <- liftIO $ maybe (return rawdata) (\c -> runFilterCommand rulesfile c rawdata) mcmd
cleandatafileh <- liftIO $ inputToHandle cleandata
do
readJournalFromCsv (Just $ Left rules) datafile cleandatafileh Nothing
-- apply any command line account aliases. Can fail with a bad replacement pattern.
>>= liftEither . journalApplyAliases (aliasesFromOpts iopts)
-- journalFinalise assumes the journal's items are
-- reversed, as produced by JournalReader's parser.
-- But here they are already properly ordered. So we'd
-- better preemptively reverse them once more. XXX inefficient
. journalReverse
>>= journalFinalise iopts{balancingopts_=(balancingopts_ iopts){ignore_assertions_=True}} rulesfile ""
>>= \j -> do
when archiving $ liftIO $ saveToArchive archivedir rulesfile datafile (mcmd <&> const cleandata)
return j
-- | Move a file to the given directory, creating the directory (and parents) if needed,
-- showing informational output on stderr.
archiveTo :: FilePath -> FilePath -> FilePath -> IO ()
archiveTo rulesfile datafile archivedir = do
-- | Run the given shell command, passing the given text as input, and return the output.
-- Or if the command fails, raise an informative error.
runFilterCommand :: FilePath -> String -> Text -> IO Text
runFilterCommand rulesfile cmd input = do
let process = (shell cmd) { std_in = CreatePipe, std_out = CreatePipe, std_err = CreatePipe }
withCreateProcess process $ \mhin mhout mherr phandle -> do
case (mhin, mhout, mherr) of
(Just hin, Just hout, Just herr) -> do
forkIO $ T.hPutStr hin input >> hClose hin
out <- T.hGetContents hout
err <- hGetContents' herr
exitCode <- waitForProcess phandle
case exitCode of
ExitSuccess -> return out
ExitFailure code ->
error' $ "in " ++ rulesfile ++ ": command \"" ++ cmd ++ "\" failed with exit code " ++ show code
++ (if null err then "" else ":\n" ++ err)
_ -> error' $ "in " ++ rulesfile ++ ": failed to create pipes for command execution"
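-- For illustration (hypothetical use, not from this codebase):
--
-- > runFilterCommand "acme.rules" "cut -d, -f1-5" input
--
-- keeps the first five comma-separated fields of each line of input,
-- or raises an error mentioning acme.rules if the command exits non-zero.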
-- | Save some successfully imported data to the given archive directory,
-- auto-creating it if needed, and showing informational output on stderr.
-- The remaining arguments are: the rules file path (for naming), the original data file,
-- and if there was a data-cleaning command, the cleaned data from that file.
-- The archive file name will be RULESFILEBASENAME.DATAFILEMODDATE.DATAFILEEXT.
-- When there is cleaned data, the original data is also saved, as
-- RULESFILEBASENAME.orig.DATAFILEMODDATE.DATAFILEEXT.
saveToArchive :: FilePath -> FilePath -> FilePath -> Maybe Text -> IO ()
saveToArchive archivedir rulesfile datafile mcleandata = do
createDirectoryIfMissing True archivedir
hPutStrLn stderr $ "archiving " <> datafile
fname <- archiveFileName rulesfile datafile
let archivefile = archivedir </> fname
hPutStrLn stderr $ " as " <> archivefile
renameFile datafile archivefile
(origname, cleanname) <- archiveFileName rulesfile datafile
let
origarchive = archivedir </> origname
cleanarchive = archivedir </> cleanname
case mcleandata of
Just cleandata -> do
hPutStrLn stderr $ " as " <> origarchive
renameFile datafile origarchive
hPutStrLn stderr $ " and " <> cleanarchive
T.writeFile cleanarchive cleandata
Nothing -> do
hPutStrLn stderr $ " as " <> cleanarchive
renameFile datafile cleanarchive
-- | Figure out the file name to use when archiving, for the given rules file, the given data file.
-- That is, "RULESFILEBASENAME.DATAFILEMODDATE.DATAFILEEXT".
archiveFileName :: FilePath -> FilePath -> IO String
-- | Figure out the file names to use when archiving, for the given rules file, the given data file.
-- The second name is for the final (possibly cleaned) data; the first name has ".orig" added,
-- and is used if both original and cleaned data are being archived. They will be like this:
-- ("RULESFILEBASENAME.orig.DATAFILEMODDATE.DATAFILEEXT", "RULESFILEBASENAME.DATAFILEMODDATE.DATAFILEEXT")
archiveFileName :: FilePath -> FilePath -> IO (String, String)
archiveFileName rulesfile datafile = do
moddate <- (show . utctDay) <$> getModificationTime datafile
return $ takeBaseName rulesfile <.> moddate <.> takeExtension datafile
let (base, ext) = (takeBaseName rulesfile, takeExtension datafile)
return (
base <.> "orig" <.> moddate <.> ext
,base <.> moddate <.> ext
)
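-- For example (hypothetical names): with rules file "acme.rules" and a data file
-- "acme.csv" last modified on 2025-08-22, this returns
-- ("acme.orig.2025-08-22.csv", "acme.2025-08-22.csv").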
-- | In the given archive directory, if it exists, find the paths of data files saved for the given rules file.
-- They will be reverse sorted by name, ie newest first, assuming normal archive file names.


@@ -3241,6 +3241,7 @@ The following kinds of rule can appear in the rules file, in any order.
| rule | purpose |
|-------------------------------------------------|------------------------------------------------------------------------------------------------|
| [**`source`**](#source) | optionally declare which file to read data from |
| [**`archive`**](#archive) | optionally enable an archive of imported files |
| [**`encoding`**](#encoding) | optionally declare which text encoding the data has |
| [**`separator`**](#separator) | declare the field separator, instead of relying on file extension |
| [**`skip`**](#skip) | skip one or more header lines at start of file |
@@ -3293,24 +3294,24 @@ All this enables a convenient workflow where you can just download CSV files, th
See also ["Working with CSV > Reading files specified by rule"](#reading-files-specified-by-rule).
The `archive` rule adds a few more features to `source`; see below.
### Data cleaning
After `source`'s file pattern, you can write `|` (pipe) and a data-cleaning command.
If hledger's CSV rules aren't enough, you can pre-process the downloaded data here with a shell command or script, to make it more suitable for conversion.
The command is executed by your default shell, receives the data file's content on standard input,
and should output zero or more lines of character-separated values, ready for conversion by hledger's CSV rules.
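For example, this (hypothetical) rule finds the newest matching `acme*.csv` download and strips carriage returns from it before the CSV rules are applied:

```
source acme*.csv | tr -d '\r'
```

A longer shell pipeline, or the path of a cleaning script, can be used in the same way.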
## `archive`
Adding the `archive` rule to your rules file affects importing or reading files specified by `source`:
Adding the `archive` rule causes `import` to archive imported data files to a nearby `data/` directory.
This is optional, but can be useful for troubleshooting, regenerating with improved rules, etc.
- After successfully importing, `import` will move the data file to an archive directory
(`data/` next to the rules file, auto-created),
renamed to `RULESFILEBASENAME.DATAFILEMODDATE.DATAFILEEXT`.
Archiving data files is optional, but it can be useful for troubleshooting,
detecting variations in your banks' CSV data, regenerating entries with improved rules, etc.
Also, it causes `import` to prefer the oldest data file, when the `source` rule's glob pattern matches multiple files.
So multiple downloads will be imported and archived in chronological order (oldest first).
- `import` will pick the oldest of `source` glob matches, rather than the newest.
So if you have multiple versions of a download, repeated imports will process them in chronological order.
- For commands other than `import`, when the `source` path or glob pattern matches no files,
hledger will try to read the latest archived data file instead.
This is convenient for working with the downloaded data again, even after it has been imported.
`archive` also affects non-`import` commands reading the rules file:
when the `source` rule's glob pattern matches no files (no new downloads are available),
they will use the archive as a fallback (reading the newest archived file, if any).
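For example, with a hypothetical `acme.rules` whose data file `acme.csv` was last modified on 2025-08-22, a successful import would archive it as something like:

```
data/acme.2025-08-22.csv
```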
## `encoding`