//===-- TextEncoding.cpp - Text encoding conversion class ---------*- C++ -*-=// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// /// \file /// This file provides utility classes to convert between different character /// encodings. /// //===----------------------------------------------------------------------===// #include "llvm/Support/TextEncoding.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/ConvertEBCDIC.h" #include #if HAVE_ICU #include #elif HAVE_ICONV #include #endif using namespace llvm; // Normalize the charset name with the charset alias matching algorithm proposed // in https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching. static void normalizeCharSetName(StringRef CSName, SmallVectorImpl &Normalized) { bool PrevDigit = false; for (auto Ch : CSName) { if (isAlnum(Ch)) { Ch = toLower(Ch); if (Ch != '0' || PrevDigit) { PrevDigit = isDigit(Ch); Normalized.push_back(Ch); } } } } // Maps the encoding name to enum constant if possible. static std::optional getKnownEncoding(StringRef Name) { SmallString<16> Normalized; normalizeCharSetName(Name, Normalized); if (Normalized.equals("utf8")) return TextEncoding::UTF8; if (Normalized.equals("ibm1047")) return TextEncoding::IBM1047; return std::nullopt; } LLVM_ATTRIBUTE_UNUSED static void HandleOverflow(size_t &Capacity, char *&Output, size_t &OutputLength, SmallVectorImpl &Result) { // No space left in output buffer. Double the size of the underlying // memory in the SmallVectorImpl, adjust pointer and length and continue // the conversion. Capacity = (Capacity < Result.max_size() / 2) ? 2 * Capacity : Result.max_size(); Result.resize(0); Result.resize_for_overwrite(Capacity); Output = static_cast(Result.data()); OutputLength = Capacity; } namespace { enum ConversionType { UTF8ToIBM1047, IBM1047ToUTF8, }; // Support conversion between EBCDIC 1047 and UTF-8. This class uses // built-in translation tables that allow for translation between the // aforementioned encodings. The use of tables for conversion is only // possible because EBCDIC 1047 is a single-byte, stateless encoding; other // encodings are not supported. class TextEncodingConverterTable final : public details::TextEncodingConverterImplBase { const ConversionType ConvType; public: TextEncodingConverterTable(ConversionType ConvType) : ConvType(ConvType) {} std::error_code convertString(StringRef Source, SmallVectorImpl &Result) override; void reset() override {} }; std::error_code TextEncodingConverterTable::convertString(StringRef Source, SmallVectorImpl &Result) { switch (ConvType) { case IBM1047ToUTF8: ConverterEBCDIC::convertToUTF8(Source, Result); return std::error_code(); case UTF8ToIBM1047: return ConverterEBCDIC::convertToEBCDIC(Source, Result); } llvm_unreachable("Invalid ConvType!"); return std::error_code(); } #if HAVE_ICU struct UConverterDeleter { void operator()(UConverter *Converter) const { if (Converter) ucnv_close(Converter); } }; using UConverterUniquePtr = std::unique_ptr; class TextEncodingConverterICU final : public details::TextEncodingConverterImplBase { UConverterUniquePtr FromConvDesc; UConverterUniquePtr ToConvDesc; public: TextEncodingConverterICU(UConverterUniquePtr FromConverter, UConverterUniquePtr ToConverter) : FromConvDesc(std::move(FromConverter)), ToConvDesc(std::move(ToConverter)) {} std::error_code convertString(StringRef Source, SmallVectorImpl &Result) override; void reset() override; }; // TODO: The current implementation discards the partial result and restarts the // conversion from the beginning if there is a conversion error due to // insufficient buffer size. In the future, it would better to save the partial // result and resume the conversion for the remaining string. // TODO: Improve translation of ICU errors to error_code std::error_code TextEncodingConverterICU::convertString(StringRef Source, SmallVectorImpl &Result) { // Setup the input in case it has no backing data. size_t InputLength = Source.size(); const char *In = InputLength ? const_cast(Source.data()) : ""; // Setup the output. We directly write into the SmallVector. size_t Capacity = Result.capacity(); size_t OutputLength = Capacity; Result.resize_for_overwrite(Capacity); char *Output; UErrorCode EC = U_ZERO_ERROR; ucnv_setToUCallBack(&*FromConvDesc, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &EC); ucnv_setFromUCallBack(&*ToConvDesc, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &EC); assert(U_SUCCESS(EC)); do { EC = U_ZERO_ERROR; const char *Input = In; Output = InputLength ? static_cast(Result.data()) : nullptr; ucnv_convertEx(&*ToConvDesc, &*FromConvDesc, &Output, Result.end(), &Input, In + InputLength, /*pivotStart=*/NULL, /*pivotSource=*/NULL, /*pivotTarget=*/NULL, /*pivotLimit=*/NULL, /*reset=*/true, /*flush=*/true, &EC); if (U_FAILURE(EC)) { if (EC == U_BUFFER_OVERFLOW_ERROR) { if (Capacity < Result.max_size()) { HandleOverflow(Capacity, Output, OutputLength, Result); continue; } else return std::error_code(E2BIG, std::generic_category()); } // Some other error occured. Result.resize(Output - Result.data()); return std::error_code(EILSEQ, std::generic_category()); } break; } while (true); Result.resize(Output - Result.data()); return std::error_code(); } void TextEncodingConverterICU::reset() { ucnv_reset(&*FromConvDesc); ucnv_reset(&*ToConvDesc); } #elif HAVE_ICONV class TextEncodingConverterIconv final : public details::TextEncodingConverterImplBase { class UniqueIconvT { iconv_t ConvDesc; public: operator iconv_t() const { return ConvDesc; } UniqueIconvT(iconv_t CD) : ConvDesc(CD) {} ~UniqueIconvT() { if (ConvDesc != (iconv_t)-1) { iconv_close(ConvDesc); ConvDesc = (iconv_t)-1; } } UniqueIconvT(UniqueIconvT &&Other) : ConvDesc(Other.ConvDesc) { Other.ConvDesc = (iconv_t)-1; } UniqueIconvT &operator=(UniqueIconvT &&Other) { if (&Other != this) { ConvDesc = Other.ConvDesc; Other.ConvDesc = (iconv_t)-1; } return *this; } }; UniqueIconvT ConvDesc; public: TextEncodingConverterIconv(UniqueIconvT ConvDesc) : ConvDesc(std::move(ConvDesc)) {} std::error_code convertString(StringRef Source, SmallVectorImpl &Result) override; void reset() override; }; // TODO: The current implementation discards the partial result and restarts the // conversion from the beginning if there is a conversion error due to // insufficient buffer size. In the future, it would better to save the partial // result and resume the conversion for the remaining string. std::error_code TextEncodingConverterIconv::convertString(StringRef Source, SmallVectorImpl &Result) { // Setup the output. We directly write into the SmallVector. size_t Capacity = Result.capacity(); char *Output = static_cast(Result.data()); size_t OutputLength = Capacity; Result.resize_for_overwrite(Capacity); size_t Ret; // Handle errors returned from iconv(). auto HandleError = [&Capacity, &Output, &OutputLength, &Result, this](size_t Ret) { if (Ret == static_cast(-1)) { // An error occured. Check if we can gracefully handle it. if (errno == E2BIG && Capacity < Result.max_size()) { HandleOverflow(Capacity, Output, OutputLength, Result); // Reset converter reset(); return std::error_code(); } else { // Some other error occured. Result.resize(Output - Result.data()); return std::error_code(errno, std::generic_category()); } } else { // A positive return value indicates that some characters were converted // in a nonreversible way, that is, replaced with a SUB symbol. Returning // an error in this case makes sure that both conversion routines behave // in the same way. return std::make_error_code(std::errc::illegal_byte_sequence); } }; do { // Setup the input. Use nullptr to reset iconv state if input length is // zero. size_t InputLength = Source.size(); char *Input = const_cast(InputLength ? Source.data() : ""); Ret = iconv(ConvDesc, &Input, &InputLength, &Output, &OutputLength); if (Ret != 0) { if (auto EC = HandleError(Ret)) return EC; continue; } // Flush the converter Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength); if (Ret != 0) { if (auto EC = HandleError(Ret)) return EC; continue; } break; } while (true); // Re-adjust size to actual size. Result.resize(Output - Result.data()); return std::error_code(); } inline void TextEncodingConverterIconv::reset() { iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr); } #endif // HAVE_ICONV } // namespace ErrorOr TextEncodingConverter::create(TextEncoding CPFrom, TextEncoding CPTo) { // Text encodings should be distinct. if (CPFrom == CPTo) return std::make_error_code(std::errc::invalid_argument); ConversionType Conversion; if (CPFrom == TextEncoding::UTF8 && CPTo == TextEncoding::IBM1047) Conversion = UTF8ToIBM1047; else if (CPFrom == TextEncoding::IBM1047 && CPTo == TextEncoding::UTF8) Conversion = IBM1047ToUTF8; else return std::make_error_code(std::errc::invalid_argument); return TextEncodingConverter( std::make_unique(Conversion)); } ErrorOr TextEncodingConverter::create(StringRef From, StringRef To) { std::optional FromEncoding = getKnownEncoding(From); std::optional ToEncoding = getKnownEncoding(To); if (FromEncoding && ToEncoding) { ErrorOr Converter = create(*FromEncoding, *ToEncoding); if (Converter) return Converter; } #if HAVE_ICU UErrorCode EC = U_ZERO_ERROR; UConverterUniquePtr FromConvDesc(ucnv_open(From.str().c_str(), &EC)); if (U_FAILURE(EC)) return std::make_error_code(std::errc::invalid_argument); UConverterUniquePtr ToConvDesc(ucnv_open(To.str().c_str(), &EC)); if (U_FAILURE(EC)) return std::make_error_code(std::errc::invalid_argument); auto Converter = std::make_unique( std::move(FromConvDesc), std::move(ToConvDesc)); return TextEncodingConverter(std::move(Converter)); #elif HAVE_ICONV iconv_t ConvDesc = iconv_open(To.str().c_str(), From.str().c_str()); if (ConvDesc == (iconv_t)-1) return std::make_error_code(std::errc::invalid_argument); return TextEncodingConverter( std::make_unique(ConvDesc)); #else return std::make_error_code(std::errc::invalid_argument); #endif }