fix:csv: respect encoding rule when rules file is input file [#2465]

This commit is contained in:
Simon Michael 2025-09-25 09:08:50 -10:00
parent 1100c7e62e
commit 9351c70f74
5 changed files with 66 additions and 35 deletions

View File

@ -33,7 +33,8 @@ import System.IO (Handle)
import Hledger.Data
import Hledger.Utils
import Hledger.Read.Common (aliasesFromOpts, Reader(..), InputOpts(..), journalFinalise)
import Hledger.Read.RulesReader (readJournalFromCsv)
import Hledger.Read.RulesReader (readJournalFromCsv, getRulesFile, rulesEncoding, readRules)
import Control.Monad.Trans (lift)
--- ** doctest setup
-- $setup
@ -60,8 +61,10 @@ reader sep = Reader
-- This does not check balance assertions.
parse :: SepFormat -> InputOpts -> FilePath -> Handle -> ExceptT String IO Journal
parse sep iopts f h = do
let mrulesfile = mrules_file_ iopts
readJournalFromCsv (Right <$> mrulesfile) f h (Just sep)
rules <- readRules $ getRulesFile f (mrules_file_ iopts)
mencoding <- rulesEncoding rules
csvtext <- lift $ readHandlePortably' mencoding h
readJournalFromCsv rules f csvtext (Just sep)
-- apply any command line account aliases. Can fail with a bad replacement pattern.
>>= liftEither . journalApplyAliases (aliasesFromOpts iopts)
-- journalFinalise assumes the journal's items are

View File

@ -29,13 +29,12 @@ module Hledger.Read.RulesReader (
-- * Reader
reader,
-- * Misc.
readJournalFromCsv,
-- readRulesFile,
-- parseCsvRules,
-- validateCsvRules,
-- CsvRules,
dataFileFor,
rulesFileFor,
getRulesFile,
readRules,
rulesEncoding,
readJournalFromCsv,
parseBalanceAssertionType,
-- * Tests
tests_RulesReader,
@ -59,7 +58,7 @@ import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as BL
import qualified Data.Csv as Cassava
import qualified Data.Csv.Parser.Megaparsec as CassavaMegaparsec
import Data.Encoding (encodingFromStringExplicit)
import Data.Encoding (encodingFromStringExplicit, DynEncoding)
import Data.Either (fromRight)
import Data.Functor ((<&>))
import Data.List (elemIndex, mapAccumL, nub, sortOn)
@ -168,7 +167,7 @@ parse iopts rulesfile h = do
-- gives: file pattern, data cleaning/generating command, archive flag
-- XXX higher-than usual logging priority for file reading (normally 6 or 7), to bypass excessive noise from elsewhere
rules <- readRulesFile $ dbg1 "reading rules file" rulesfile
rules <- readRules $ dbg1 "reading rules file" rulesfile
let
msourcearg = getDirective "source" rules
-- Nothing -> error' $ rulesfile ++ " source rule must specify a file pattern or a command"
@ -219,7 +218,7 @@ parse iopts rulesfile h = do
(Nothing, _) -> return ()
-- 5. read raw, cleaned or generated data
-- needs: file pattern, data file, data command
-- needs: file pattern, data file, optional data file encoding, data command
-- gives: clean data (possibly empty)
mexistingdatafile <- maybe (return Nothing) (\f -> liftIO $ do
@ -233,9 +232,10 @@ parse iopts rulesfile h = do
return ""
-- file found, and maybe a data cleaning command
(_, Just f, mc) -> -- trace "file found" $
(_, Just f, mc) -> do -- trace "file found" $
mencoding <- rulesEncoding rules
liftIO $ do
raw <- openFileOrStdin f >>= readHandlePortably
raw <- openFileOrStdin f >>= readHandlePortably' mencoding
maybe (return raw) (\c -> runCommandAsFilter rulesfile (dbg0Msg ("running: "++c) c) raw) mc
-- no file pattern, but a data generating command
@ -247,12 +247,11 @@ parse iopts rulesfile h = do
error' $ rulesfile ++ " source rule must specify a file pattern or a command"
-- 6. convert the clean data to a (possibly empty) journal
-- needs: clean data, rules, rules file, data file if any
-- needs: clean data, rules, data file if any
-- gives: journal
j <- do
cleandatah <- liftIO $ inputToHandle cleandata
readJournalFromCsv (Just $ Left rules) (fromMaybe "(cmd)" mdatafile) cleandatah Nothing
readJournalFromCsv rules (fromMaybe "(cmd)" mdatafile) cleandata Nothing
-- apply any command line account aliases. Can fail with a bad replacement pattern.
>>= liftEither . journalApplyAliases (aliasesFromOpts iopts)
-- journalFinalise assumes the journal's items are
@ -389,15 +388,37 @@ dataFileFor = stripExtension "rules"
rulesFileFor :: FilePath -> FilePath
rulesFileFor = (++ ".rules")
-- | Return the given rules file path, or if none is given,
-- the default rules file for the given csv file;
-- or if the csv file is "-", raise an error.
getRulesFile :: FilePath -> Maybe FilePath -> FilePath
getRulesFile csvfile mrulesfile =
case mrulesfile of
Nothing | csvfile == "-" ->
error' "please use --rules when reading CSV from stdin" -- PARTIAL
-- XXX is this bad ? everything else here uses ExceptT
Nothing -> rulesFileFor csvfile
Just f -> f
-- | An exception-throwing IO action that reads and validates
-- the specified CSV rules file (which may include other rules files).
readRulesFile :: FilePath -> ExceptT String IO CsvRules
readRulesFile f =
readRules :: FilePath -> ExceptT String IO CsvRules
readRules f =
liftIO (do
dbg6IO "using conversion rules file" f
readFilePortably f >>= expandIncludes (takeDirectory f)
) >>= either throwError return . parseAndValidateCsvRules f
-- | Read the encoding specified by the @encoding@ rule, if any.
-- Or throw an error if an unrecognised encoding is specified.
rulesEncoding :: CsvRules -> ExceptT String IO (Maybe DynEncoding)
rulesEncoding rules = do
case T.unpack <$> getDirective "encoding" rules of
Nothing -> return Nothing
Just encstr -> case encodingFromStringExplicit $ dbg4 "encoding name" encstr of
Nothing -> throwError $ "Invalid encoding: " <> encstr
Just enc -> return . Just $ dbg4 "encoding" enc
-- | Inline all files referenced by include directives in this hledger CSV rules text, recursively.
-- Included file paths may be relative to the directory of the provided file path.
-- Unlike with journal files, this is done as a pre-parse step to simplify the CSV rules parser.
@ -1167,27 +1188,12 @@ _CSV_READING__________________________________________ = undefined
--
-- 4. Return the transactions as a Journal.
--
readJournalFromCsv :: Maybe (Either CsvRules FilePath) -> FilePath -> Handle -> Maybe SepFormat -> ExceptT String IO Journal
readJournalFromCsv Nothing "-" h _ = lift (hClose h) *> throwError "please use --rules when reading CSV from stdin"
readJournalFromCsv merulesfile csvfile csvhandle sep = do
readJournalFromCsv :: CsvRules -> FilePath -> Text -> Maybe SepFormat -> ExceptT String IO Journal
readJournalFromCsv rules csvfile csvtext sep = do
-- for now, correctness is the priority here, efficiency not so much
rules <- case merulesfile of
Just (Left rs) -> return rs
Just (Right rulesfile) -> readRulesFile rulesfile
Nothing -> readRulesFile $ rulesFileFor csvfile
dbg6IO "csv rules" rules
-- read csv while being aware of the encoding
mencoding <- do
-- XXX higher-than usual debug level for file reading to bypass excessive noise from elsewhere, normally 6 or 7
case T.unpack <$> getDirective "encoding" rules of
Just rawenc -> case encodingFromStringExplicit $ dbg4 "raw-encoding" rawenc of
Just enc -> return . Just $ dbg4 "encoding" enc
Nothing -> throwError $ "Invalid encoding: " <> rawenc
Nothing -> return Nothing
csvtext <- lift $ readHandlePortably' mencoding csvhandle
-- convert the csv data to lines and remove all empty/blank lines
let csvlines1 = dbg9 "csvlines1" $ filter (not . T.null . T.strip) $ dbg9 "csvlines0" $ T.lines csvtext

View File

@ -0,0 +1,18 @@
# * CSV encoding tests
# ** 1. The encoding rule works when reading the csv file as input.
$ hledger -f t.iso8859-1.csv print
2025-01-01 éclair
expenses:unknown 1
income:unknown -1
>=
# ** 2. The encoding rule works when reading rules file as input. [#2465]
$ hledger -f t.iso8859-1.csv.rules print
2025-01-01 éclair
expenses:unknown 1
income:unknown -1
>=

View File

@ -0,0 +1 @@
2025-01-01, éclair, 1
1 2025-01-01 éclair 1

View File

@ -0,0 +1,3 @@
source ./t.iso8859-1.csv
encoding iso8859-1
fields date, description, amount