feat:csv: add an encoding rule, allowing non-UTF8 CSV to be read [#2319]

Previously, hledger could read CSV files containing non-ascii
characters only if they are UTF8-encoded.  Now there is a new CSV
rule, encoding ENCODING, which allows reading CSV files with other
encodings.

This adds a dependency on the encoding library, which supports fewer
encodings than text-icu but does not require a third-party C library.
To avoid build issues on various platforms, we require version 0.10+.

This adds some use of the ImplicitParams language extension, required
by encoding's API, but only in a small code region.

This also changes the type of Reader's rReadFn; it now takes
a `Handle` rather than a `Text`, allowing more flexibility.
This commit is contained in:
Joschua Kesper 2025-01-21 22:05:37 +01:00 committed by Simon Michael
parent d68a832d1c
commit 5114962b2a
26 changed files with 220 additions and 56 deletions

View File

@ -102,6 +102,7 @@ module Hledger.Read (
-- * Easy journal parsing
readJournal',
readJournal'',
readJournalFile',
readJournalFiles',
orDieTrying,
@ -125,7 +126,7 @@ module Hledger.Read (
--- ** imports
import qualified Control.Exception as C
import Control.Monad (unless, when, forM)
import Control.Monad (unless, when, forM, (<=<))
import "mtl" Control.Monad.Except (ExceptT(..), runExceptT, liftEither)
import Control.Monad.IO.Class (MonadIO, liftIO)
import Data.Default (def)
@ -145,7 +146,7 @@ import System.Environment (getEnv)
import System.Exit (exitFailure)
import System.FilePath ((<.>), (</>), splitDirectories, splitFileName, takeFileName)
import System.Info (os)
import System.IO (hPutStr, stderr)
import System.IO (Handle, hPutStr, stderr)
import Hledger.Data.Dates (getCurrentDay, parsedateM, showDate)
import Hledger.Data.Types
@ -205,7 +206,7 @@ type PrefixedFilePath = FilePath
-- | @readJournal iopts mfile txt@
--
-- Read a Journal from some text, with strict checks if enabled,
-- Read a Journal from some handle, with strict checks if enabled,
-- or return an error message.
--
-- The reader (data format) is chosen based on, in this order:
@ -219,11 +220,11 @@ type PrefixedFilePath = FilePath
-- If none of these is available, or if the reader name is unrecognised,
-- we use the journal reader (for predictability).
--
readJournal :: InputOpts -> Maybe FilePath -> Text -> ExceptT String IO Journal
readJournal iopts@InputOpts{strict_, _defer} mpath txt = do
readJournal :: InputOpts -> Maybe FilePath -> Handle -> ExceptT String IO Journal
readJournal iopts@InputOpts{strict_, _defer} mpath hdl = do
let r :: Reader IO = fromMaybe JournalReader.reader $ findReader (mformat_ iopts) mpath
dbg6IO "readJournal: trying reader" (rFormat r)
j <- rReadFn r iopts (fromMaybe "(string)" mpath) txt
j <- rReadFn r iopts (fromMaybe "(string)" mpath) hdl
when (strict_ && not _defer) $ liftEither $ journalStrictChecks j
return j
@ -264,11 +265,11 @@ readJournalFileAndLatestDates iopts prefixedfile = do
(mfmt, f) = splitReaderPrefix prefixedfile
iopts' = iopts{mformat_=asum [mfmt, mformat_ iopts]}
liftIO $ requireJournalFileExists f
t <-
h <-
traceOrLogAt 6 ("readJournalFile: "++takeFileName f) $
liftIO $ readFileOrStdinPortably f
liftIO $ openFileOrStdin f
-- <- T.readFile f -- or without line ending translation, for testing
j <- readJournal iopts' (Just f) t
j <- readJournal iopts' (Just f) h
if new_ iopts
then do
ds <- liftIO $ previousLatestDates f
@ -313,9 +314,14 @@ readJournalFilesAndLatestDates iopts pfs = do
-- | An easy version of 'readJournal' which assumes default options, and fails
-- in the IO monad.
readJournal' :: Text -> IO Journal
readJournal' :: Handle -> IO Journal
readJournal' = orDieTrying . readJournal definputopts Nothing
-- | An even easier version of 'readJournal' which additionally to 'readJournal''
-- also takes a 'Text' instead of a 'Handle'.
readJournal'' :: Text -> IO Journal
readJournal'' = readJournal' <=< inputToHandle
-- | An easy version of 'readJournalFile' which assumes default options, and fails
-- in the IO monad.
readJournalFile' :: PrefixedFilePath -> IO Journal

View File

@ -34,6 +34,7 @@ module Hledger.Read.Common (
HasInputOpts(..),
definputopts,
rawOptsToInputOpts,
handleReadFnToTextReadFn,
-- * parsing utilities
parseAndFinaliseJournal,
@ -148,6 +149,7 @@ import Data.Time.Clock.POSIX (getPOSIXTime)
import Data.Time.LocalTime (LocalTime(..), TimeOfDay(..))
import Data.Word (Word8)
import System.FilePath (takeFileName)
import System.IO (Handle)
import Text.Megaparsec
import Text.Megaparsec.Char (char, char', digitChar, newline, string)
import Text.Megaparsec.Char.Lexer (decimal)
@ -179,9 +181,9 @@ data Reader m = Reader {
,rExtensions :: [String]
-- The entry point for reading this format, accepting input options, file
-- path for error messages and file contents, producing an exception-raising IO
-- path for error messages and file contents via the handle, producing an exception-raising IO
-- action that produces a journal or error message.
,rReadFn :: InputOpts -> FilePath -> Text -> ExceptT String IO Journal
,rReadFn :: InputOpts -> FilePath -> Handle -> ExceptT String IO Journal
-- The actual megaparsec parser called by the above, in case
-- another parser (includedirectivep) wants to use it directly.
@ -231,6 +233,10 @@ rawOptsToInputOpts day usecoloronstdout postingaccttags rawopts =
,_ioDay = day
}
handleReadFnToTextReadFn :: (InputOpts -> FilePath -> Text -> ExceptT String IO Journal) -> InputOpts -> FilePath -> Handle -> ExceptT String IO Journal
handleReadFnToTextReadFn p iopts fp =
p iopts fp <=< lift . readHandlePortably
-- | Get the date span from --forecast's PERIODEXPR argument, if any.
-- This will fail with a usage error if the period expression cannot be parsed,
-- or if it contains a report interval.

View File

@ -28,7 +28,7 @@ where
import Prelude hiding (Applicative(..))
import Control.Monad.Except (ExceptT(..), liftEither)
import Control.Monad.IO.Class (MonadIO)
import Data.Text (Text)
import System.IO (Handle)
import Hledger.Data
import Hledger.Utils
@ -54,10 +54,10 @@ reader sep = Reader
-- This file path is normally the CSV(/SSV/TSV) data file, and a corresponding rules file is inferred.
-- But it can also be the rules file, in which case the corresponding data file is inferred.
-- This does not check balance assertions.
parse :: SepFormat -> InputOpts -> FilePath -> Text -> ExceptT String IO Journal
parse sep iopts f t = do
parse :: SepFormat -> InputOpts -> FilePath -> Handle -> ExceptT String IO Journal
parse sep iopts f h = do
let mrulesfile = mrules_file_ iopts
readJournalFromCsv (Right <$> mrulesfile) f t (Just sep)
readJournalFromCsv (Right <$> mrulesfile) f h (Just sep)
-- apply any command line account aliases. Can fail with a bad replacement pattern.
>>= liftEither . journalApplyAliases (aliasesFromOpts iopts)
-- journalFinalise assumes the journal's items are
@ -65,7 +65,7 @@ parse sep iopts f t = do
-- But here they are already properly ordered. So we'd
-- better preemptively reverse them once more. XXX inefficient
. journalReverse
>>= journalFinalise iopts{balancingopts_=(balancingopts_ iopts){ignore_assertions_=True}} f t
>>= journalFinalise iopts{balancingopts_=(balancingopts_ iopts){ignore_assertions_=True}} f ""
--- ** tests

View File

@ -194,7 +194,7 @@ reader :: MonadIO m => Reader m
reader = Reader
{rFormat = Journal'
,rExtensions = ["journal", "j", "hledger", "ledger"]
,rReadFn = parse
,rReadFn = handleReadFnToTextReadFn parse
,rParser = journalp -- no need to add command line aliases like journalp'
-- when called as a subparser I think
}

View File

@ -52,6 +52,7 @@ import Control.Monad.State.Strict (StateT, get, modify', evalStateT)
import Control.Monad.Trans.Class (lift)
import Data.Char (toLower, isDigit, isSpace, isAlphaNum, ord)
import Data.Bifunctor (first)
import Data.Encoding (encodingFromStringExplicit)
import Data.Functor ((<&>))
import Data.List (elemIndex, mapAccumL, nub, sortOn)
#if !MIN_VERSION_base(4,20,0)
@ -69,6 +70,7 @@ import Data.Time ( Day, TimeZone, UTCTime, LocalTime, ZonedTime(ZonedTime),
defaultTimeLocale, getCurrentTimeZone, localDay, parseTimeM, utcToLocalTime, localTimeToUTC, zonedTimeToUTC)
import Safe (atMay, headMay, lastMay, readMay)
import System.FilePath ((</>), takeDirectory, takeExtension, stripExtension, takeFileName)
import System.IO (Handle, hClose)
import qualified Data.Csv as Cassava
import qualified Data.Csv.Parser.Megaparsec as CassavaMegaparsec
import qualified Data.ByteString as B
@ -116,10 +118,11 @@ getDownloadDir = do
-- file's directory. When a glob pattern matches multiple files, the alphabetically
-- last is used. (Eg in case of multiple numbered downloads, the highest-numbered
-- will be used.)
-- The provided text, or a --rules option, are ignored by this reader.
-- The provided handle, or a --rules option, are ignored by this reader.
-- Balance assertions are not checked.
parse :: InputOpts -> FilePath -> Text -> ExceptT String IO Journal
parse iopts f _ = do
parse :: InputOpts -> FilePath -> Handle -> ExceptT String IO Journal
parse iopts f h = do
lift $ hClose h -- We don't need it
rules <- readRulesFile $ dbg4 "reading rules file" f
-- XXX higher-than usual debug level for file reading to bypass excessive noise from elsewhere, normally 6 or 7
mdatafile <- liftIO $ do
@ -139,8 +142,8 @@ parse iopts f _ = do
if not (dat=="-" || exists)
then return nulljournal -- data file inferred from rules file name was not found
else do
t <- liftIO $ readFileOrStdinPortably dat
readJournalFromCsv (Just $ Left rules) dat t Nothing
dath <- liftIO $ openFileOrStdin dat
readJournalFromCsv (Just $ Left rules) dat dath Nothing
-- apply any command line account aliases. Can fail with a bad replacement pattern.
>>= liftEither . journalApplyAliases (aliasesFromOpts iopts)
-- journalFinalise assumes the journal's items are
@ -500,6 +503,7 @@ directivep = (do
directives :: [Text]
directives =
["source"
,"encoding"
,"date-format"
,"decimal-mark"
,"separator"
@ -908,9 +912,9 @@ _CSV_READING__________________________________________ = undefined
--
-- 4. Return the transactions as a Journal.
--
readJournalFromCsv :: Maybe (Either CsvRules FilePath) -> FilePath -> Text -> Maybe SepFormat -> ExceptT String IO Journal
readJournalFromCsv Nothing "-" _ _ = throwError "please use --rules when reading CSV from stdin"
readJournalFromCsv merulesfile csvfile csvtext sep = do
readJournalFromCsv :: Maybe (Either CsvRules FilePath) -> FilePath -> Handle -> Maybe SepFormat -> ExceptT String IO Journal
readJournalFromCsv Nothing "-" h _ = lift (hClose h) *> throwError "please use --rules when reading CSV from stdin"
readJournalFromCsv merulesfile csvfile csvhandle sep = do
-- for now, correctness is the priority here, efficiency not so much
rules <- case merulesfile of
@ -919,6 +923,16 @@ readJournalFromCsv merulesfile csvfile csvtext sep = do
Nothing -> readRulesFile $ rulesFileFor csvfile
dbg6IO "csv rules" rules
-- read csv while being aware of the encoding
mencoding <- do
-- XXX higher-than usual debug level for file reading to bypass excessive noise from elsewhere, normally 6 or 7
case T.unpack <$> getDirective "encoding" rules of
Just rawenc -> case encodingFromStringExplicit $ dbg4 "raw-encoding" rawenc of
Just enc -> return . Just $ dbg4 "encoding" enc
Nothing -> throwError $ "Invalid encoding: " <> rawenc
Nothing -> return Nothing
csvtext <- lift $ readHandlePortably' mencoding csvhandle
-- convert the csv data to lines and remove all empty/blank lines
let csvlines1 = dbg9 "csvlines1" $ filter (not . T.null . T.strip) $ dbg9 "csvlines0" $ T.lines csvtext

View File

@ -79,7 +79,7 @@ reader :: MonadIO m => Reader m
reader = Reader
{rFormat = Timeclock
,rExtensions = ["timeclock"]
,rReadFn = parse
,rReadFn = handleReadFnToTextReadFn parse
,rParser = timeclockfilep
}

View File

@ -68,7 +68,7 @@ reader :: MonadIO m => Reader m
reader = Reader
{rFormat = Timedot
,rExtensions = ["timedot"]
,rReadFn = parse
,rReadFn = handleReadFnToTextReadFn parse
,rParser = timedotp
}

View File

@ -301,7 +301,7 @@ tests_PostingsReport = testGroup "PostingsReport" [
,"postings report with cleared option" ~:
do
let opts = defreportopts{cleared_=True}
j <- readJournal' sample_journal_str
j <- readJournal'' sample_journal_str
(postingsReportAsText opts $ postingsReport opts (queryFromOpts date1 opts) j) `is` unlines
["2008/06/03 eat & shop expenses:food $1 $1"
," expenses:supplies $1 $2"
@ -313,7 +313,7 @@ tests_PostingsReport = testGroup "PostingsReport" [
,"postings report with uncleared option" ~:
do
let opts = defreportopts{uncleared_=True}
j <- readJournal' sample_journal_str
j <- readJournal'' sample_journal_str
(postingsReportAsText opts $ postingsReport opts (queryFromOpts date1 opts) j) `is` unlines
["2008/01/01 income assets:bank:checking $1 $1"
," income:salary $-1 0"
@ -325,7 +325,7 @@ tests_PostingsReport = testGroup "PostingsReport" [
,"postings report sorts by date" ~:
do
j <- readJournal' $ unlines
j <- readJournal'' $ unlines
["2008/02/02 a"
," b 1"
," c"

View File

@ -4,7 +4,9 @@ pretty-printing haskell values, error reporting, time, files, command line parsi
terminals, pager output, ANSI colour/styles, etc.
-}
{-# LANGUAGE ImplicitParams #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE PackageImports #-}
{-# LANGUAGE ScopedTypeVariables #-}
@ -31,11 +33,15 @@ module Hledger.Utils.IO (
expandPath,
expandGlob,
sortByModTime,
openFileOrStdin,
readFileOrStdinPortably,
readFileOrStdinPortably',
readFileStrictly,
readFilePortably,
readHandlePortably,
readHandlePortably',
-- hereFileRelative,
inputToHandle,
-- * Command line parsing
progArgs,
@ -111,6 +117,7 @@ import Data.Char (toLower)
import Data.Colour.RGBSpace (RGB(RGB))
import Data.Colour.RGBSpace.HSL (lightness)
import Data.Colour.SRGB (sRGB)
import Data.Encoding (DynEncoding)
import Data.FileEmbed (makeRelativeToProject, embedStringFile)
import Data.Functor ((<&>))
import Data.List hiding (uncons)
@ -136,8 +143,9 @@ import System.FilePath (isRelative, (</>))
import "Glob" System.FilePath.Glob (glob)
import System.Info (os)
import System.IO (Handle, IOMode (..), hGetEncoding, hSetEncoding, hSetNewlineMode, openFile, stdin, stdout, stderr, universalNewlineMode, utf8_bom, hIsTerminalDevice, hPutStr, hClose)
import qualified System.IO.Encoding as Enc
import System.IO.Unsafe (unsafePerformIO)
import System.Process (CreateProcess(..), StdStream(CreatePipe), shell, waitForProcess, withCreateProcess)
import System.Process (CreateProcess(..), StdStream(CreatePipe), createPipe, shell, waitForProcess, withCreateProcess)
import Text.Pretty.Simple (CheckColorTty(..), OutputOptions(..), defaultOutputOptionsDarkBg, defaultOutputOptionsNoColor, pShowOpt, pPrintOpt)
import Hledger.Utils.Text (WideBuilder(WideBuilder))
@ -280,19 +288,39 @@ readFilePortably f = openFile f ReadMode >>= readHandlePortably
-- | Like readFilePortably, but read from standard input if the path is "-".
readFileOrStdinPortably :: String -> IO T.Text
readFileOrStdinPortably f = openFileOrStdin f ReadMode >>= readHandlePortably
where
openFileOrStdin :: String -> IOMode -> IO Handle
openFileOrStdin "-" _ = return stdin
openFileOrStdin f' m = openFile f' m
readFileOrStdinPortably = readFileOrStdinPortably' Nothing
-- | Like readFileOrStdinPortably, but take an optional converter.
readFileOrStdinPortably' :: Maybe DynEncoding -> String -> IO T.Text
readFileOrStdinPortably' c f = openFileOrStdin f >>= readHandlePortably' c
openFileOrStdin :: String -> IO Handle
openFileOrStdin "-" = return stdin
openFileOrStdin f' = openFile f' ReadMode
readHandlePortably :: Handle -> IO T.Text
readHandlePortably h = do
readHandlePortably = readHandlePortably' Nothing
readHandlePortably' :: Maybe DynEncoding -> Handle -> IO T.Text
readHandlePortably' Nothing h = do
hSetNewlineMode h universalNewlineMode
menc <- hGetEncoding h
when (fmap show menc == Just "UTF-8") $ -- XXX no Eq instance, rely on Show
hSetEncoding h utf8_bom
T.hGetContents h
readHandlePortably' (Just e) h =
-- We need to manually apply the newline mode
-- Since we already have a Text
T.replace "\r\n" "\n" . T.pack <$> let ?enc = e in Enc.hGetContents h
inputToHandle :: T.Text -> IO Handle
inputToHandle t = do
(r, w) <- createPipe
hSetEncoding r utf8_bom
hSetEncoding w utf8_bom
T.hPutStr w t
hClose w
return r
-- | Like embedFile, but takes a path relative to the package directory.
embedFileRelative :: FilePath -> Q Exp

View File

@ -141,6 +141,7 @@ library
, deepseq
, directory >=1.2.6.1
, doclayout >=0.3 && <0.6
, encoding >=0.10
, extra >=1.6.3
, file-embed >=0.0.10
, filepath
@ -201,6 +202,7 @@ test-suite doctest
, directory >=1.2.6.1
, doclayout >=0.3 && <0.6
, doctest >=0.18.1
, encoding >=0.10
, extra >=1.6.3
, file-embed >=0.0.10
, filepath
@ -262,6 +264,7 @@ test-suite unittest
, deepseq
, directory >=1.2.6.1
, doclayout >=0.3 && <0.6
, encoding >=0.10
, extra >=1.6.3
, file-embed >=0.0.10
, filepath

View File

@ -59,6 +59,7 @@ dependencies:
- Decimal >=0.5.1
- directory >=1.2.6.1
- doclayout >=0.3 && <0.6
- encoding >=0.10
- file-embed >=0.0.10
- filepath
- hashtables >=1.2.3.1

View File

@ -131,7 +131,7 @@ hledgerWebTest = do
rawopts = [("forecast","")]
iopts = rawOptsToInputOpts d usecolor True $ mkRawOpts rawopts
f = "fake" -- need a non-null filename so forecast transactions get index 0
pj <- readJournal' (T.pack $ unlines -- PARTIAL: readJournal' should not fail
pj <- readJournal'' (T.pack $ unlines -- PARTIAL: readJournal'' should not fail
["~ monthly"
," assets 10"
," income"

View File

@ -66,7 +66,7 @@ writeJournalTextIfValidAndChanged f t = mapExceptT liftIO $ do
-- formatdirectivep, #1194) writeFileWithBackupIfChanged require them.
-- XXX klunky. Any equivalent of "hSetNewlineMode h universalNewlineMode" for form posts ?
let t' = T.replace "\r" "" t
j <- readJournal definputopts (Just f) t'
j <- readJournal definputopts (Just f) =<< liftIO (inputToHandle t')
_ <- liftIO $ j `seq` writeFileWithBackupIfChanged f t' -- Only write backup if the journal didn't error
return ()

View File

@ -391,8 +391,8 @@ tests_Commands = testGroup "Commands" [
let
ignoresourcepos j = j{jtxns=map (\t -> t{tsourcepos=nullsourcepospair}) (jtxns j)}
sameParse str1 str2 = do
j1 <- ignoresourcepos <$> readJournal' str1 -- PARTIAL:
j2 <- ignoresourcepos <$> readJournal' str2 -- PARTIAL:
j1 <- ignoresourcepos <$> readJournal'' str1 -- PARTIAL:
j2 <- ignoresourcepos <$> readJournal'' str2 -- PARTIAL:
j1 @?= j2{jlastreadtime=jlastreadtime j1, jfiles=jfiles j1} --, jparsestate=jparsestate j1}
sameParse
("2008/12/07 One\n alpha $-1\n beta $1\n" <>
@ -409,19 +409,19 @@ tests_Commands = testGroup "Commands" [
)
,testCase "preserves \"virtual\" posting type" $ do
j <- readJournal' "apply account test\n2008/12/07 One\n (from) $-1\n (to) $1\n" -- PARTIAL:
j <- readJournal'' "apply account test\n2008/12/07 One\n (from) $-1\n (to) $1\n" -- PARTIAL:
let p = headErr $ tpostings $ headErr $ jtxns j -- PARTIAL headErrs succeed because txns & postings provided
paccount p @?= "test:from"
ptype p @?= VirtualPosting
]
,testCase "alias directive" $ do
j <- readJournal' "!alias expenses = equity:draw:personal\n1/1\n (expenses:food) 1\n" -- PARTIAL:
j <- readJournal'' "!alias expenses = equity:draw:personal\n1/1\n (expenses:food) 1\n" -- PARTIAL:
let p = headErr $ tpostings $ headErr $ jtxns j -- PARTIAL headErrs succeed because txns & postings provided
paccount p @?= "equity:draw:personal:food"
,testCase "Y default year directive" $ do
j <- readJournal' defaultyear_journal_txt -- PARTIAL:
j <- readJournal'' defaultyear_journal_txt -- PARTIAL:
tdate (headErr $ jtxns j) @?= fromGregorian 2009 1 1 -- PARTIAL headErr succeeds because defaultyear_journal_txt has a txn
,testCase "ledgerAccountNames" $
@ -454,7 +454,7 @@ tests_Commands = testGroup "Commands" [
-- t1 = LocalTime date1 midday
{-
samplejournal = readJournal' sample_journal_str
samplejournal = readJournal'' sample_journal_str
sample_journal_str = unlines
["; A sample journal file."

View File

@ -459,7 +459,7 @@ ensureOneNewlineTerminated = (<>"\n") . T.dropWhileEnd (=='\n')
-- | Convert a string of journal data into a register report.
registerFromString :: T.Text -> IO TL.Text
registerFromString s = do
j <- readJournal' s
j <- readJournal'' s
return . postingsReportAsText opts $ postingsReport rspec j
where
ropts = defreportopts{empty_=True}

View File

@ -1226,7 +1226,7 @@ tests_Balance = testGroup "Balance" [
testGroup "balanceReportAsText" [
testCase "unicode in balance layout" $ do
j <- readJournal' "2009/01/01 * медвежья шкура\n расходы:покупки 100\n актив:наличные\n"
j <- readJournal'' "2009/01/01 * медвежья шкура\n расходы:покупки 100\n актив:наличные\n"
let rspec = defreportspec{_rsReportOpts=defreportopts{no_total_=True}}
TB.toLazyText (balanceReportAsText (_rsReportOpts rspec) (balanceReport rspec{_rsDay=fromGregorian 2008 11 26} j))
@?=

View File

@ -304,7 +304,7 @@ tests_Register = testGroup "Register" [
testGroup "postingsReportAsText" [
testCase "unicode in register layout" $ do
j <- readJournal' "2009/01/01 * медвежья шкура\n расходы:покупки 100\n актив:наличные\n"
j <- readJournal'' "2009/01/01 * медвежья шкура\n расходы:покупки 100\n актив:наличные\n"
let rspec = defreportspec
(TL.unpack . postingsReportAsText defcliopts $ postingsReport rspec j)
@?=

View File

@ -106,7 +106,7 @@ For more about how to do that on your system, see [Common tasks > Setting LEDGER
## Text encoding
Data files containing non-ascii characters must use UTF-8 encoding.
Data files containing non-ascii characters must use UTF-8 encoding, with the exception being `csv` files (see [`encoding`](#encoding) below).
An optional [byte order mark (BOM)](https://www.unicode.org/faq/utf_bom.html#BOM) is allowed, at the beginning of the file (only).
Also, your system should be configured with a locale that can decode UTF-8 text.
@ -114,8 +114,6 @@ On some unix systems, you may need set the `LANG` environment variable, eg.
You can read more about this in [Unicode characters](#unicode-characters), below.
On unix systems you can check a file's encoding with the `file` command.
If you need to import from a UTF-16-encoded CSV file, say,
you can convert it to UTF-8 with the `iconv` command.
## Data formats
@ -3217,6 +3215,7 @@ The following kinds of rule can appear in the rules file, in any order.
| | |
|-------------------------------------------------|------------------------------------------------------------------------------------------------|
| [**`source`**](#source) | optionally declare which file to read data from |
| [**`encoding`**](#encoding) | optionally declare which encoding the data has |
| [**`separator`**](#separator) | declare the field separator, instead of relying on file extension |
| [**`skip`**](#skip) | skip one or more header lines at start of file |
| [**`date-format`**](#date-format) | declare how to parse CSV dates/date-times |
@ -3263,6 +3262,77 @@ source Checking1*.csv
See also ["Working with CSV > Reading files specified by rule"](#reading-files-specified-by-rule).
## `encoding`
```rules
encoding ENCODING
```
Specifying `encoding` followed by a valid encoding tells HLedger how to convert a
csv to be able to make use of it.
This is most often useful when getting a csv from a bank as they are sometimes
in an old encoding.
If none is given, `utf8` is assumed.
The encoding will be checked case-insensitive with some alternative spellings also allowed.
The full list of valid encodings is:
- ASCII
- UTF8
- UTF16
- UTF32
- ISO88591
- ISO88592
- ISO88593
- ISO88594
- ISO88595
- ISO88596
- ISO88597
- ISO88598
- ISO88599
- ISO885910
- ISO885911
- ISO885913
- ISO885914
- ISO885915
- ISO885916
- CP1250
- CP1251
- CP1252
- CP1253
- CP1254
- CP1255
- CP1256
- CP1257
- CP1258
- KOI8R
- KOI8U
- GB18030
- MacOSRoman
- JISX0201
- JISX0208
- ISO2022JP
- ShiftJIS
- CP437
- CP737
- CP775
- CP850
- CP852
- CP855
- CP857
- CP860
- CP861
- CP862
- CP863
- CP864
- CP865
- CP866
- CP869
- CP874
- CP932
Alternate spellings may be found in the [source code of `encoding`](https://hackage.haskell.org/package/encoding/docs/src/Data.Encoding.html#encodingFromStringExplicit)
## `separator`
You can use the `separator` rule to read other kinds of

View File

@ -12,10 +12,11 @@ packages:
# To minimise problems, use the versions shipped with GHC 9.10.1 when possible.
# See https://github.com/haskell/unix/issues/329.
# extra-deps:
extra-deps:
# - base-compat-0.14.0
# # for hledger-lib
# for hledger-lib
- encoding-0.10
# - Cabal-3.12.0.0
# - Cabal-syntax-3.12.0.0
# #- directory-1.3.8.3

View File

@ -31,6 +31,7 @@ extra-deps:
- text-builder-0.6.7
- text-builder-dev-0.3.3.2
- isomorphism-class-0.1.0.7
- encoding-0.10
# for hledger:
# silence a warning
- wizards-1.0.3@rev:3
@ -50,6 +51,10 @@ nix:
pure: false
packages: [perl gmp ncurses zlib]
flags:
encoding:
systemEncoding: false # See https://github.com/dmwit/encoding/issues/26
# ghc-options:
# "$locals": -fplugin Debug.Breakpoint

View File

@ -22,6 +22,7 @@ extra-deps:
- text-builder-0.6.7
- text-builder-dev-0.3.3.2
- isomorphism-class-0.1.0.7
- encoding-0.10
# for hledger:
# for hledger-ui:
- bimap-0.5.0
@ -39,6 +40,10 @@ nix:
pure: false
packages: [perl gmp ncurses zlib]
flags:
encoding:
systemEncoding: false # See https://github.com/dmwit/encoding/issues/26
# ghc-options:
# "$locals": -fplugin Debug.Breakpoint

View File

@ -18,7 +18,8 @@ packages:
extra-deps:
# - base-compat-0.14.0
# # for hledger-lib
# for hledger-lib
- encoding-0.10
# - Cabal-3.12.0.0
# - Cabal-syntax-3.12.0.0
# #- directory-1.3.8.3
@ -41,6 +42,10 @@ nix:
pure: false
packages: [perl gmp ncurses zlib]
flags:
encoding:
systemEncoding: false # See https://github.com/dmwit/encoding/issues/26
# ghc-options:
# "$locals": -Wno-x-partial
# "$locals": -fplugin Debug.Breakpoint

View File

@ -13,6 +13,7 @@ extra-deps:
- megaparsec-9.3.0
- safe-0.3.21
# for hledger-lib:
- encoding-0.10
# for hledger:
# for hledger-ui:
- brick-2.3.1
@ -29,6 +30,10 @@ nix:
pure: false
packages: [perl gmp ncurses zlib]
flags:
encoding:
systemEncoding: false # See https://github.com/dmwit/encoding/issues/26
# ghc-options:
# "$locals": -fplugin Debug.Breakpoint

View File

@ -16,11 +16,16 @@ extra-deps:
- vty-crossplatform-0.4.0.0
- vty-unix-0.2.0.0
- vty-windows-0.2.0.2
- encoding-0.10
nix:
pure: false
packages: [perl gmp ncurses zlib]
flags:
encoding:
systemEncoding: false # See https://github.com/dmwit/encoding/issues/26
# ghc-options:
# "$locals": -fplugin Debug.Breakpoint

View File

@ -10,12 +10,17 @@ packages:
extra-deps:
- base-compat-0.14.0
- vty-windows-0.2.0.1 # not yet in stackage
- vty-windows-0.2.0.1
- encoding-0.10
nix:
pure: false
packages: [perl gmp ncurses zlib]
flags:
encoding:
systemEncoding: false # See https://github.com/dmwit/encoding/issues/26
# ghc-options:
# "$locals": -fplugin Debug.Breakpoint

View File

@ -10,11 +10,16 @@ packages:
extra-deps:
- base-compat-0.14.0
- encoding-0.10
nix:
pure: false
packages: [perl gmp ncurses zlib]
flags:
encoding:
systemEncoding: false # See https://github.com/dmwit/encoding/issues/26
# ghc-options:
# "$locals": -Wno-x-partial
# "$locals": -fplugin Debug.Breakpoint