feat:csv: support data cleaning scripts

This commit is contained in:
Simon Michael 2025-08-22 16:31:53 +01:00
parent c3e85ce9f7
commit c515fedf70
2 changed files with 144 additions and 66 deletions

View File

@@ -45,6 +45,7 @@ where
 --- ** imports
 import Prelude hiding (Applicative(..))
 import Control.Applicative (Applicative(..))
+import Control.Concurrent (forkIO)
 import Control.DeepSeq (deepseq)
 import Control.Monad (unless, void, when)
 import Control.Monad.Except (ExceptT(..), liftEither, throwError)
@@ -54,7 +55,11 @@ import Control.Monad.State.Strict (StateT, get, modify', evalStateT)
 import Control.Monad.Trans.Class (lift)
 import Data.Char (toLower, isDigit, isSpace, isAlphaNum, ord)
 import Data.Bifunctor (first)
+import qualified Data.ByteString as B
+import qualified Data.ByteString.Lazy as BL
+import qualified Data.Csv as Cassava
+import qualified Data.Csv.Parser.Megaparsec as CassavaMegaparsec
 import Data.Encoding (encodingFromStringExplicit)
 import Data.Either (fromRight)
 import Data.Functor ((<&>))
 import Data.List (elemIndex, mapAccumL, nub, sortOn, isPrefixOf, sortBy)
@@ -74,12 +79,10 @@ import Data.Time ( Day, TimeZone, UTCTime, LocalTime, ZonedTime(ZonedTime),
   defaultTimeLocale, getCurrentTimeZone, localDay, parseTimeM, utcToLocalTime, localTimeToUTC, zonedTimeToUTC, utctDay)
 import Safe (atMay, headMay, lastMay, readMay, headDef)
 import System.Directory (createDirectoryIfMissing, doesFileExist, getHomeDirectory, getModificationTime, listDirectory, renameFile, doesDirectoryExist)
+import System.Exit (ExitCode(..))
 import System.FilePath (stripExtension, takeBaseName, takeDirectory, takeExtension, takeFileName, (<.>), (</>))
-import System.IO (Handle, hClose, hPutStrLn, stderr)
-import qualified Data.Csv as Cassava
-import qualified Data.Csv.Parser.Megaparsec as CassavaMegaparsec
-import qualified Data.ByteString as B
-import qualified Data.ByteString.Lazy as BL
+import System.IO (Handle, hClose, hPutStrLn, stderr, hGetContents')
+import System.Process (CreateProcess(..), StdStream(CreatePipe), shell, waitForProcess, withCreateProcess)
 import Data.Foldable (asum, toList)
 import Text.Megaparsec hiding (match, parse)
 import Text.Megaparsec.Char (char, newline, string, digitChar)
@@ -113,17 +116,35 @@ getDownloadDir = do
   return $ home </> "Downloads" -- XXX
 
 -- | Read, parse and post-process a "Journal" from the given rules file, or give an error.
+-- This particular reader also provides some extra features like data-cleaning and archiving.
+--
+-- The provided input file handle, and the --rules option, are ignored by this reader.
+-- Instead, a data file (or data-generating command) is usually specified by the @source@ rule.
+-- If there's no source rule, the data file is assumed to be named like the rules file without .rules, in the same directory.
 --
--- The provided handle, or a --rules option, are ignored by this reader.
--- A data file is inferred from the @source@ rule, otherwise from a similarly-named file in the same directory.
 -- The source rule supports ~ for home directory.
--- If it is a bare filename, its directory is assumed to be ~/Downloads.
+-- If the argument is a bare filename, its directory is assumed to be ~/Downloads.
 -- If is a relative file path, it is assumed to be relative to the rules file's directory.
+--
 -- The source rule can specify a glob pattern.
--- If the glob pattern matches multiple files, the newest (last modified) file is used,
--- unless the import command is running and archiving is enabled, in which case the oldest file is used.
--- When the import command is running and archiving is enabled, after a successful read
--- the data file is archived in an archive directory (data/ next to the rules file, auto-created).
+-- If the glob pattern matches multiple files, the newest (last modified) file is used (see also below).
+--
+-- The source rule can specify a data-cleaning command, after the file pattern and a | separator.
+-- This command is executed by the user's default shell, receives the data file's content on stdin,
+-- and should output data suitable for hledger to convert with CSV rules.
+-- A # character can be used to comment out the data-cleaning command.
+--
+-- When using the source rule, if the archive rule is also present, some behaviours change:
+--
+-- - The import command:
+--   will move the data file to an archive directory after a successful read
+--   (renamed like the rules file, date-stamped, to an auto-created data/ directory next to the rules file).
+--   And it will read the oldest data file, not the newest, if the glob pattern matches multiple files.
+--   If there is a data-cleaning command, only the original uncleaned data is archived, currently.
+--
+-- - Other commands:
+--   will read the newest archived data file, if any, as a fallback if the glob pattern matches no data files.
+--
 -- Balance assertions are not checked by this reader.
 --
 parse :: InputOpts -> FilePath -> Handle -> ExceptT String IO Journal
@@ -137,17 +158,30 @@ parse iopts rulesfile h = do
     -- XXX How can we know when the command is import, and if it's a dry run ? In a hacky way, currently.
     args = progArgs
     cmd = headDef "" $ dropWhile ((=="-").take 1) args
-    importcmd = dbg7 "importcmd" $ cmd `elem` ["import", "imp"]
+    cmdisimport = dbg7 "cmdisimport" $ cmd `elem` ["import", "imp"]
     dryrun = dbg7 "dryrun" $ any (`elem` args) ["--dry-run", "--dry"]
-    importing = dbg7 "importing" $ importcmd && not dryrun
+    importing = dbg7 "importing" $ cmdisimport && not dryrun
     archive = dbg7 "archive" $ isJust (getDirective "archive" rules)
     archiving = dbg7 "archiving" $ importing && archive
     rulesdir = dbg7 "rulesdir" $ takeDirectory rulesfile
     archivedir = dbg7 "archivedir" $ rulesdir </> "data"
-  mdatafile <- liftIO $ do
+  mdatafileandcmd <- liftIO $ do
     dldir <- getDownloadDir -- look here for the data file if it's specified without a directory
-    let msource = T.unpack <$> getDirective "source" rules
+    let
+      msourcearg = getDirective "source" rules
+      -- Surrounding whitespace is removed from the whole source argument and from each part of it.
+      -- A # before | makes the rest of line a comment.
+      -- A # after | is left for the shell to interpret; it could be part of the command or the start of a comment.
+      stripspaces = T.strip
+      stripcommentandspaces = stripspaces . T.takeWhile (/= '#')
+      msourceandcmd = T.breakOn "|" . stripspaces <$> msourcearg
+      msource = T.unpack . stripcommentandspaces . fst <$> msourceandcmd
+      mcmd = msourceandcmd >>= \sc ->
+        let c = T.unpack . stripspaces . T.drop 1 . snd $ sc
+        in if null c then Nothing else Just c
     datafiles <- case msource of
       Nothing -> return [maybe err (dbg4 "inferred source") $ dataFileFor rulesfile]  -- shouldn't fail, f has .rules extension
         where err = error' $ "could not infer a data file for " <> rulesfile
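
The new let-bindings above split the source rule's argument into a file pattern and an optional data-cleaning command. As a rough standalone illustration of that splitting (not hledger's actual code; the function and variable names below are made up):

```haskell
import qualified Data.Text as T

-- Split a source argument like "Checking1*.csv | clean.sh" into a file
-- pattern and an optional cleaning command, as described in the comments above:
-- a # in the pattern part comments out the rest of that part, while a #
-- after the | is passed through to the shell.
parseSourceArg :: T.Text -> (String, Maybe String)
parseSourceArg arg = (T.unpack pat, mcmd)
  where
    (before, after) = T.breakOn (T.pack "|") (T.strip arg)
    pat  = T.strip (T.takeWhile (/= '#') before)
    cmd  = T.strip (T.drop 1 after)
    mcmd = if T.null cmd then Nothing else Just (T.unpack cmd)

-- parseSourceArg (T.pack "Checking1*.csv | clean.sh")  ==  ("Checking1*.csv", Just "clean.sh")
-- parseSourceArg (T.pack "Checking1*.csv")             ==  ("Checking1*.csv", Nothing)
```
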
@@ -156,55 +190,98 @@ parse iopts rulesfile h = do
         globmatches <- expandGlob dir (dbg4 "source rule" glb) >>= sortByModTime <&> dbg4 ("matched files"<>desc<>", oldest first")
         case globmatches of
           -- if the source rule matched no files, and we are reading not importing, use the most recent archive file
-          [] | archive && not importcmd -> do
+          [] | archive && not cmdisimport -> do
             archivesFor archivedir rulesfile <&> take 1 <&> dbg4 "latest file in archive directory"
           _ -> return globmatches
     return $ case datafiles of
-      [] -> Nothing
-      [f] | importcmd -> dbg4 "importing" <$> Just f
-      [f] -> dbg4 "reading" <$> Just f
-      fs | importcmd && archiving -> dbg4 "importing oldest file" <$> headMay fs
-      fs | importcmd -> dbg4 "importing newest file" <$> lastMay fs
-      fs -> dbg4 "reading newest file" <$> lastMay fs
+      [] -> (Nothing, Nothing)
+      [f] | cmdisimport -> dbg4 "importing" (Just f, mcmd)
+      [f] -> dbg4 "reading" (Just f, mcmd)
+      fs | cmdisimport && archiving -> dbg4 "importing oldest file" (headMay fs, mcmd)
+      fs | cmdisimport -> dbg4 "importing newest file" (lastMay fs, mcmd)
+      fs -> dbg4 "reading newest file" (lastMay fs, mcmd)
 
-  case mdatafile of
-    Nothing -> return nulljournal  -- data file specified by source rule was not found
-    Just datafile -> do
+  case mdatafileandcmd of
+    (Nothing, _) -> return nulljournal  -- data file specified by source rule was not found
+    (Just datafile, mcmd) -> do
       exists <- liftIO $ doesFileExist datafile
       if not (datafile=="-" || exists)
       then return nulljournal  -- data file inferred from rules file name was not found
       else do
        datafileh <- liftIO $ openFileOrStdin datafile
-       readJournalFromCsv (Just $ Left rules) datafile datafileh Nothing
-       -- apply any command line account aliases. Can fail with a bad replacement pattern.
-       >>= liftEither . journalApplyAliases (aliasesFromOpts iopts)
-       -- journalFinalise assumes the journal's items are
-       -- reversed, as produced by JournalReader's parser.
-       -- But here they are already properly ordered. So we'd
-       -- better preemptively reverse them once more. XXX inefficient
-       . journalReverse
-       >>= journalFinalise iopts{balancingopts_=(balancingopts_ iopts){ignore_assertions_=True}} rulesfile ""
-       >>= \j -> do
-         when archiving $ liftIO $ archiveTo rulesfile datafile archivedir
-         return j
+       rawdata <- liftIO $ readHandlePortably datafileh
+       cleandata <- liftIO $ maybe (return rawdata) (\c -> runFilterCommand rulesfile c rawdata) mcmd
+       cleandatafileh <- liftIO $ inputToHandle cleandata
+       do
+         readJournalFromCsv (Just $ Left rules) datafile cleandatafileh Nothing
+         -- apply any command line account aliases. Can fail with a bad replacement pattern.
+         >>= liftEither . journalApplyAliases (aliasesFromOpts iopts)
+         -- journalFinalise assumes the journal's items are
+         -- reversed, as produced by JournalReader's parser.
+         -- But here they are already properly ordered. So we'd
+         -- better preemptively reverse them once more. XXX inefficient
+         . journalReverse
+         >>= journalFinalise iopts{balancingopts_=(balancingopts_ iopts){ignore_assertions_=True}} rulesfile ""
+         >>= \j -> do
+           when archiving $ liftIO $ saveToArchive archivedir rulesfile datafile (mcmd <&> const cleandata)
+           return j
 
--- | Move a file to the given directory, creating the directory (and parents) if needed,
--- showing informational output on stderr.
-archiveTo :: FilePath -> FilePath -> FilePath -> IO ()
-archiveTo rulesfile datafile archivedir = do
+-- | Run the given shell command, passing the given text as input, and return the output.
+-- Or if the command fails, raise an informative error.
+runFilterCommand :: FilePath -> String -> Text -> IO Text
+runFilterCommand rulesfile cmd input = do
+  let process = (shell cmd) { std_in = CreatePipe, std_out = CreatePipe, std_err = CreatePipe }
+  withCreateProcess process $ \mhin mhout mherr phandle -> do
+    case (mhin, mhout, mherr) of
+      (Just hin, Just hout, Just herr) -> do
+        forkIO $ T.hPutStr hin input >> hClose hin
+        out <- T.hGetContents hout
+        err <- hGetContents' herr
+        exitCode <- waitForProcess phandle
+        case exitCode of
+          ExitSuccess -> return out
+          ExitFailure code ->
+            error' $ "in " ++ rulesfile ++ ": command \"" ++ cmd ++ "\" failed with exit code " ++ show code
+              ++ (if null err then "" else ":\n" ++ err)
+      _ -> error' $ "in " ++ rulesfile ++ ": failed to create pipes for command execution"
+
+-- | Save some successfully imported data to the given archive directory,
+-- autocreating that if needed, and showing informational output on stderr.
+-- The remaining arguments are: the rules file path (for naming), the original data file,
+-- and if there was a data-cleaning command, the cleaned data from that file.
+-- The archive file name will be RULESFILEBASENAME.DATAFILEMODDATE.DATAFILEEXT.
+-- When there is cleaned data, the original data is also saved, as
+-- RULESFILEBASENAME.orig.DATAFILEMODDATE.DATAFILEEXT.
+saveToArchive :: FilePath -> FilePath -> FilePath -> Maybe Text -> IO ()
+saveToArchive archivedir rulesfile datafile mcleandata = do
   createDirectoryIfMissing True archivedir
   hPutStrLn stderr $ "archiving " <> datafile
-  fname <- archiveFileName rulesfile datafile
-  let archivefile = archivedir </> fname
-  hPutStrLn stderr $ " as " <> archivefile
-  renameFile datafile archivefile
+  (origname, cleanname) <- archiveFileName rulesfile datafile
+  let
+    origarchive  = archivedir </> origname
+    cleanarchive = archivedir </> cleanname
+  case mcleandata of
+    Just cleandata -> do
+      hPutStrLn stderr $ " as " <> origarchive
+      renameFile datafile origarchive
+      hPutStrLn stderr $ " and " <> cleanarchive
+      T.writeFile cleanarchive cleandata
+    Nothing -> do
+      hPutStrLn stderr $ " as " <> cleanarchive
+      renameFile datafile cleanarchive
 
--- | Figure out the file name to use when archiving, for the given rules file, the given data file.
--- That is, "RULESFILEBASENAME.DATAFILEMODDATE.DATAFILEEXT".
-archiveFileName :: FilePath -> FilePath -> IO String
+-- | Figure out the file names to use when archiving, for the given rules file, the given data file.
+-- The second name is for the final (possibly cleaned) data; the first name has ".orig" added,
+-- and is used if both original and cleaned data are being archived. They will be like this:
+-- ("RULESFILEBASENAME.orig.DATAFILEMODDATE.DATAFILEEXT", "RULESFILEBASENAME.DATAFILEMODDATE.DATAFILEEXT")
+archiveFileName :: FilePath -> FilePath -> IO (String, String)
 archiveFileName rulesfile datafile = do
   moddate <- (show . utctDay) <$> getModificationTime datafile
-  return $ takeBaseName rulesfile <.> moddate <.> takeExtension datafile
+  let (base, ext) = (takeBaseName rulesfile, takeExtension datafile)
+  return (
+     base <.> "orig" <.> moddate <.> ext
+    ,base <.> moddate <.> ext
+    )
 
 -- | In the given archive directory, if it exists, find the paths of data files saved for the given rules file.
 -- They will be reverse sorted by name, ie newest first, assuming normal archive file names.
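
The data-cleaning step added above amounts to: feed the downloaded text to a shell command's stdin and use its stdout instead. Here is a minimal standalone sketch of that idea, using the process package's readCreateProcessWithExitCode helper rather than the hand-rolled pipes of runFilterCommand; function names and the example command are illustrative only:

```haskell
import qualified Data.Text as T
import System.Exit (ExitCode(..))
import System.Process (readCreateProcessWithExitCode, shell)

-- Run a shell command, feeding it the given text on stdin, and return its stdout.
-- On a nonzero exit code, fail with the command's stderr included in the message.
cleanWith :: String -> T.Text -> IO T.Text
cleanWith cmd input = do
  (exitcode, out, err) <- readCreateProcessWithExitCode (shell cmd) (T.unpack input)
  case exitcode of
    ExitSuccess      -> return (T.pack out)
    ExitFailure code -> ioError $ userError $
      "command " ++ show cmd ++ " failed with exit code " ++ show code
      ++ (if null err then "" else ":\n" ++ err)

-- Example: drop the first (header) line before hledger's CSV rules see the data.
main :: IO ()
main = do
  cleaned <- cleanWith "tail -n +2" (T.pack "junk header\n2025-08-22,coffee,-3.50\n")
  putStr (T.unpack cleaned)
```
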

View File

@@ -3241,6 +3241,7 @@ The following kinds of rule can appear in the rules file, in any order.
 |                               |                                                                    |
 |-------------------------------|--------------------------------------------------------------------|
 | [**`source`**](#source)       | optionally declare which file to read data from                   |
+| [**`archive`**](#archive)     | optionally enable an archive of imported files                    |
 | [**`encoding`**](#encoding)   | optionally declare which text encoding the data has               |
 | [**`separator`**](#separator) | declare the field separator, instead of relying on file extension |
 | [**`skip`**](#skip)           | skip one or more header lines at start of file                    |
@@ -3293,24 +3294,24 @@ All this enables a convenient workflow where can you just download CSV files, th
 See also ["Working with CSV > Reading files specified by rule"](#reading-files-specified-by-rule).
 
-The `archive` rule adds a few more features to `source`; see below.
+### Data cleaning
+
+After `source`'s file pattern, you can write `|` (pipe) and a data cleaning command.
+If hledger's CSV rules aren't enough, you can pre-process the downloaded data here with a shell command or script, to make it more suitable for conversion.
+The command will be executed by your default shell, will receive the data file's content as standard input,
+and should output zero or more lines of character-separated-values, ready for conversion by hledger's CSV rules.
 
 ## `archive`
 
-Adding the `archive` rule to your rules file affects importing or reading files specified by `source`:
-
-- After successfully importing, `import` will move the data file to an archive directory
-  (`data/` next to the rules file, auto-created),
-  renamed to `RULESFILEBASENAME.DATAFILEMODDATE.DATAFILEEXT`.
-  Archiving data files is optional, but it can be useful for troubleshooting,
-  detecting variations in your banks' CSV data, regenerating entries with improved rules, etc.
-
-- `import` will pick the oldest of `source` glob matches, rather than the newest.
-  So if you have multiple versions of a download, repeated imports will process them in chronological order.
-
-- For commands other than `import`, when the `source` path or glob pattern matches no files,
-  hledger will try to read the latest archived data file instead.
-  This is convenient for working with the downloaded data again, even after it has been imported.
+Adding the `archive` rule causes `import` to archive imported data files to a nearby `data/` directory.
+This is optional, but can be useful for troubleshooting, regenerating with improved rules, etc.
+
+Also, it causes `import` to prefer the oldest data file, when the `source` rule's glob pattern matches multiple files.
+So multiple downloads will be imported and archived in chronological order (oldest first).
+
+`archive` also affects non-`import` commands reading the rules file:
+when the `source` rule's glob pattern matches no files (no new downloads are available),
+they will use the archive as a fallback (reading the newest archived file, if any).
 
 ## `encoding`
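
Putting the two new rules together, a rules file for a bank download might now include something like this (the file and script names here are only illustrative):

```
# read the newest matching download from ~/Downloads,
# piping it through a local cleanup script first
source Checking1*.csv | ./clean-checking.sh

# archive each imported download to the data/ directory next to this rules file
archive
```
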