diff options
author | Chris Lattner <sabre@nondot.org> | 2010-04-20 18:14:03 +0000 |
---|---|---|
committer | Chris Lattner <sabre@nondot.org> | 2010-04-20 18:14:03 +0000 |
commit | 8fbe98b3b6e44ca94c3edd630e39150228cc774e (patch) | |
tree | 3cd34de0021aae1c22b6dc54ad5d8c21e87b8472 /clang/lib/Basic/SourceManager.cpp | |
parent | 91baecfeb3c23d6ebd93583ced233aa070e6feea (diff) | |
download | llvm-8fbe98b3b6e44ca94c3edd630e39150228cc774e.zip llvm-8fbe98b3b6e44ca94c3edd630e39150228cc774e.tar.gz llvm-8fbe98b3b6e44ca94c3edd630e39150228cc774e.tar.bz2 |
Enhance SourceManager to detect various UTF BOMs and emit a fatal error
about them instead of producing tons of garbage from the lexer.
It would be even better for SourceManager to dynamically transcode (e.g.
from UTF-16 to UTF-8).
llvm-svn: 101924
Diffstat (limited to 'clang/lib/Basic/SourceManager.cpp')
-rw-r--r-- | clang/lib/Basic/SourceManager.cpp | 35 |
1 files changed, 35 insertions, 0 deletions
diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp index 053cfe3..c766241 100644 --- a/clang/lib/Basic/SourceManager.cpp +++ b/clang/lib/Basic/SourceManager.cpp @@ -119,6 +119,41 @@ const llvm::MemoryBuffer *ContentCache::getBuffer(Diagnostic &Diag, Buffer.setInt(true); #endif } + + // If the buffer is valid, check to see if it has a UTF Byte Order Mark + // (BOM). We only support UTF-8 without a BOM right now. See + // http://en.wikipedia.org/wiki/Byte_order_mark for more information. + if (!Buffer.getInt()) { + llvm::StringRef BufStr = Buffer.getPointer()->getBuffer(); + const char *BOM = 0; + if (BufStr.startswith("\xFE\xBB\xBF")) + BOM = "UTF-8"; + else if (BufStr.startswith("\xFE\xFF")) + BOM = "UTF-16 (BE)"; + else if (BufStr.startswith("\xFF\xFE")) + BOM = "UTF-16 (LE)"; + else if (BufStr.startswith(llvm::StringRef("\x00\x00\xFE\xFF", 4))) + BOM = "UTF-32 (BE)"; + else if (BufStr.startswith(llvm::StringRef("\xFF\xFE\x00\x00", 4))) + BOM = "UTF-32 (LE)"; + else if (BufStr.startswith("\x2B\x2F\x76")) + BOM = "UTF-7"; + else if (BufStr.startswith("\xF7\x64\x4C")) + BOM = "UTF-1"; + else if (BufStr.startswith("\xDD\x73\x66\x73")) + BOM = "UTF-EBCDIC"; + else if (BufStr.startswith("\x0E\xFE\xFF")) + BOM = "SDSU"; + else if (BufStr.startswith("\xFB\xEE\x28")) + BOM = "BOCU-1"; + else if (BufStr.startswith("\x84\x31\x95\x33")) + BOM = "BOCU-1"; + + if (BOM) { + Diag.Report(diag::err_unsupported_bom) << BOM << Entry->getName(); + Buffer.setInt(1); + } + } } if (Invalid) |