imp: show a clearer error message on unicode en/decoding failures [#73]

A very long-awaited improvement: for unicode exceptions, and I/O
exceptions which look like they were caused by a unicode error
(usually text decoding failure), our error message now includes
an explanation and advice on what to do.

Currently this uses the GHC.IO.Encoding API, which is not ideal:
"The API of this module is unstable and not meant to be consumed by
the general public. If you absolutely must depend on it, make sure to
use a tight upper bound, e.g., base < 4.X rather than base < 5,
because the interface can change rapidly without much warning."

Also it relies on scanning for patterns in GHC's various
unicode-related error messages, which may not be complete and could
change in future. To do: try the encoding package's IO helpers,
perhaps they give more specific exceptions.
This commit is contained in:
Simon Michael 2025-04-25 08:50:53 -10:00
parent 06cd0f9472
commit 45d5fd7963

View File

@ -131,6 +131,7 @@ import Data.List hiding (uncons)
import Data.Maybe (isJust, catMaybes) import Data.Maybe (isJust, catMaybes)
import Data.Ord (comparing, Down (Down)) import Data.Ord (comparing, Down (Down))
import qualified Data.Text as T import qualified Data.Text as T
import Data.Text.Encoding.Error (UnicodeException)
import qualified Data.Text.IO as T import qualified Data.Text.IO as T
import qualified Data.Text.Lazy as TL import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.Builder as TB import qualified Data.Text.Lazy.Builder as TB
@ -139,6 +140,7 @@ import Data.Time.LocalTime (LocalTime, ZonedTime, getCurrentTimeZone,
import Data.Word (Word16) import Data.Word (Word16)
import Debug.Trace import Debug.Trace
import Foreign.C.Error (Errno(..), ePIPE) import Foreign.C.Error (Errno(..), ePIPE)
import GHC.IO.Encoding (getLocaleEncoding, textEncodingName)
import GHC.IO.Exception (IOException(..), IOErrorType (ResourceVanished)) import GHC.IO.Exception (IOException(..), IOErrorType (ResourceVanished))
import Language.Haskell.TH.Syntax (Q, Exp) import Language.Haskell.TH.Syntax (Q, Exp)
import Safe (headMay, maximumDef) import Safe (headMay, maximumDef)
@ -159,7 +161,6 @@ import Text.Pretty.Simple (CheckColorTty(..), OutputOptions(..), defau
import Hledger.Utils.Text (WideBuilder(WideBuilder)) import Hledger.Utils.Text (WideBuilder(WideBuilder))
-- Pretty showing/printing -- Pretty showing/printing
-- using pretty-simple -- using pretty-simple
@ -236,48 +237,120 @@ ansiFormatWarning = (<> sgrresetall) . ((sgrbrightyellow <> sgrbold) <>)
modifyFirstLine :: (String -> String) -> String -> String modifyFirstLine :: (String -> String) -> String -> String
modifyFirstLine f s = intercalate "\n" $ map f l <> ls where (l,ls) = splitAt 1 $ lines s -- total modifyFirstLine f s = intercalate "\n" $ map f l <> ls where (l,ls) = splitAt 1 $ lines s -- total
-- | This wraps a program's main routine, to handle program-terminating exceptions {- | Print an error message to stderr, with a consistent "programname: " prefix,
-- in a consistent compiler-independent way. and applying ANSI styling (bold bright red) to the first line if that is supported and allowed.
-- In particular it catches @error'@/@error@/@errorWithoutStackTrace@ calls and IO errors, -}
-- strips any outer whitespace from their output, and passes this to exitWithErrorMsg.
-- This prevents GHC 9.10's trailing newlines and GHC 9.12's "uncaught exception" output,
-- while exitWithErrorMsg ensures a consistent prefix, and styles the first line if allowed.
--
-- Some program-terminating exceptions this does not handle:
-- ExitCode (triggered by a call to exitSuccess, exitFailure, or exitWith)
-- and UserInterrupt (triggered by control-C).
--
[Handler (\(e::ErrorCall) -> exitWithError $ rstrip $ show e)
,Handler (\(e::IOError) -> exitWithError $ rstrip $ show e)
-- ,Handler (\(x::ExitCode) -> exitWith x) -- falls through
-- ,Handler (\UserInterrupt -> exitFailure) -- falls through
]
where
rstrip = reverse . dropWhile isSpace . reverse
-- | Print an error message with printError,
-- then exit the program with a non-zero exit code.
-- | Print an error message to stderr,
-- with a standard program name prefix,
-- and styling the first line with ansiFormatError if that's allowed.
printError :: String -> IO () printError :: String -> IO ()
printError msg = do printError msg = do
progname <- getProgName progname <- getProgName
usecolor <- useColorOnStderr usecolor <- useColorOnStderr
let let
style = if usecolor then modifyFirstLine ansiFormatError else id style = if usecolor then modifyFirstLine ansiFormatError else id
prefix = progname <> ": " prefix =
-- error' prepends an "Error: " prefix. But that seems to have been removed when I catch the ErrorCall exception - unless I'm running in GHCI. progname
-- Is it possible something in GHC or base is removing it ? <> ": "
-- Use a stupid heuristic for now: add it again unless already there. -- error' prepends an "Error: " prefix. But that seems to have been removed when I catch the ErrorCall exception - unless I'm running in GHCI.
<> (if "Error:" `isPrefixOf` msg then "" else "Error: ") -- Is it possible something in GHC or base is removing it ?
-- Use a stupid heuristic for now: add it again unless already there.
<> (if "Error:" `isPrefixOf` msg then "" else "Error: ")
hPutStrLn stderr $ style $ prefix <> msg hPutStrLn stderr $ style $ prefix <> msg
{- | Print an error message with printError,
then exit the program with a non-zero exit code.
-}
exitWithErrorMessage :: String -> IO () exitWithErrorMessage :: String -> IO ()
exitWithErrorMessage msg = printError msg >> exitFailure exitWithErrorMessage msg = printError msg >> exitFailure
-- | This wraps a program's main routine so as to display more consistent
-- and useful error output for some common program-terminating exceptions,
-- independent of compiler version. It catches:
--
-- - UnicodeException - unicode errors in pure code
--
-- - IOException AKA IOError - I/O errors, including unicode errors during I/O
--
-- - ErrorCall - @error@ / @errorWithoutStackTrace@ calls
--
-- and:
--
-- - removes the trailing newlines added by some GHC 9.10.*
--
-- - removes "uncaught exception" output added by some GHC 9.12.*
--
-- - ensures a consistent "programname: " prefix
--
-- - applies ANSI styling (bold bright red) to the first line if that is supported and allowed
--
-- - for unicode exceptions, and I/O exceptions which look like they were
-- caused by a unicode error (usually text decoding failure),
-- it adds (english) text explaining the problem and what to do.
--
-- Some exceptions this does not handle:
-- ExitCode (exitSuccess / exitFailure / exitWith calls)
-- and UserInterrupt (control-C).
--
exitOnError :: IO () -> IO () exitOnError :: IO () -> IO ()
exitOnError = flip catches exitOnError = flip catches
[Handler (\(e::UnicodeException) -> exitUnicode e)
,Handler (\(e::IOException) -> if isUnicodeError e then exitUnicode e else exitOther e)
,Handler (\(e::ErrorCall) -> exitOther e)
]
where
isUnicodeError :: Exception e => e -> Bool
isUnicodeError ex = any (`isInfixOf` msg) unicodeerrorpatterns
where
msg = map toLower $ show ex
unicodeerrorpatterns = [ -- keep updated
"illegal byte sequence"
, "invalid byte sequence"
, "invalid character"
, "invalid or incomplete multibyte"
, "mkTextEncoding: invalid argument"
]
exitUnicode :: Exception e => e -> IO ()
exitUnicode ex = do
enc <- getSystemEncoding
let
noencoding = map toLower enc == "ascii"
msg = unlines $ [
rstrip $ show ex
, "Some text could not be decoded/encoded with the system text encoding: " <> enc
] ++
if noencoding
then [
"Please configure a system locale with a text encoding to handle non-ascii text"
]
else [
-- advice suitable for programs which always use the system text encoding:
"Please convert all data to the system encoding (eg with iconv),"
, "or configure the system encoding to match your data (eg by setting LANG)."
]
exitWithErrorMessage msg
exitOther :: Exception e => e -> IO ()
exitOther = exitWithErrorMessage . rstrip . show
rstrip = reverse . dropWhile isSpace . reverse
-- I18n
-- encoding has a similar getSystemEncoding :: IO (Maybe DynEncoding)
-- but it returns Nothing on Windows or if there's an error.
-- | Get the name of the text encoding used by the current locale, using GHC's API.
getSystemEncoding :: IO String
getSystemEncoding = do
localeEncoding <- getLocaleEncoding
return $ textEncodingName localeEncoding
-- -- | Get the name of the text encoding currently configured for stdout, using GHC's API.
-- getStdoutEncoding :: IO (Maybe String)
-- getStdoutEncoding = do
-- mEncoding <- hGetEncoding stdout
-- return $ fmap textEncodingName mEncoding
-- Time -- Time
getCurrentLocalTime :: IO LocalTime getCurrentLocalTime :: IO LocalTime