diff options
Diffstat (limited to 'libgcobol/xmlparse.cc')
| -rw-r--r-- | libgcobol/xmlparse.cc | 592 |
1 files changed, 592 insertions, 0 deletions
diff --git a/libgcobol/xmlparse.cc b/libgcobol/xmlparse.cc new file mode 100644 index 0000000..69849e3 --- /dev/null +++ b/libgcobol/xmlparse.cc @@ -0,0 +1,592 @@ +/* + * Copyright (c) 2021-2025 Symas Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of the Symas Corporation nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <fcntl.h> +#include <unistd.h> + +#include <cctype> +#include <cerrno> +#include <cmath> +#include <cfenv> +#include <cstdio> +#include <cstdlib> +#include <cstring> +#include <ctime> + +#include <algorithm> +#include <vector> + +#include <libxml/SAX2.h> +#include <libxml/parser.h> + +#include "config.h" +#include "libgcobol-fp.h" +#include "ec.h" +#include "common-defs.h" +#include "io.h" +#include "gcobolio.h" +#include "libgcobol.h" + +#define COUNT_OF(X) (sizeof(X) / sizeof(X[0])) + +void sayso( const char func[], int line, + int len = 0 , const unsigned char data[] = { 0} ) { + if( getenv("XMLPARSE") ) { + switch(len) { + case 0: + fprintf(stderr, "%s:%d Kilroy was here\n", func, line); + break; + case -1: + fprintf(stderr, "%s:%d: '%s'\n", func, line, data); + break; + default: + fprintf(stderr, "%s:%d: '%.*s'\n", func, line, len, data); + break; + } + } +} +#define SAYSO() sayso(__func__, __LINE__) +#define SAYSO_DATAZ(S) sayso(__func__, __LINE__, -1, S) +#define SAYSO_DATA(N, S) sayso(__func__, __LINE__, N, S) + +struct xml_ec_value_t { + int ibm_code; + const char msg[80]; +} xml_ec_values[] = { + // Table 73. XML PARSE exceptions that allow continuation + { 1, "invalid character between elements" }, + { 2, "invalid start before element content" }, + { 3, "duplicate attribute" }, + { 4, "markup character '<' in an attribute value" }, + { 5, "start/end tag mismatch" }, + { 6, "invalid character in element" }, + { 7, "invalid start in element content. " }, + { 8, "CDATA closing character sequence ']]>' not opened" }, + { 10, "comment the character sequence '--' without '>'" }, + { 11, "invalid character in a processing instruction" }, + { 12, "XML declaration was not start of document" }, + { 13, "invalid digit in a hexadecimal character reference" }, + { 14, "invalid digit in a decimal character reference" }, + { 15, "encoding declaration value name must start with [a-zA-Z] character" }, + { 16, "character reference did not refer to a legal XML character" }, + { 17, "invalid character in an entity reference name" }, + { 70, "EBCDIC document, supported EBCDIC page, unsupported declaration" }, + { 71, "EBCDIC document, unsupported EBCDIC page " }, + { 72, "EBCDIC document, unsupported EBCDIC page, unsupported declaration" }, + { 73, "EBCDIC document, unsupported EBCDIC page and declaration " }, + { 80, "ASCII document, supported ASCII page, unsupported declaration" }, + { 81, "ASCII document, unsupported ASCII page " }, + { 82, "ASCII document, unsupported ASCII page, unsupported declaration" }, + { 83, "ASCII document, unsupported ASCII page and declaration " }, + { 84, "ASCII document, invalid UTF-8, external UTF-8, no declaration. " }, + { 85, "ASCII document, invalid UTF-8, external UTF-8, invalid declaration" }, + { 86, "ASCII document, invalid UTF-8, external ASCII" }, + { 87, "ASCII document, invalid UTF-8, external and document UTF-8" }, + { 88, "ASCII document, invalid UTF-8, unsupported ASCII/UTF-8, UTF-8 declaration" }, + { 89, "ASCII document, invalid UTF-8, external UTF-8, ASCII declaration" }, + { 92, "alphanumeric document expected, document is UTF-16. " }, + + // XML PARSE exceptions that allow continuation (continued) + //// 100,001 - 165,535 EBCDIC document encoding does not match code page + //// 200,001 - 265,535 ASCII document encoding does not match code page + + // XML PARSE exceptions that do not allow continuation + { 100, "end of document before start of XML declaration" }, + { 101, "end of document before end of XML declaration" }, + { 102, "end of document before root element" }, + { 103, "end of document before version information in XML declaration" }, + { 104, "end of document before version information value in XML declaration" }, + { 106, "end of document before encoding declaration value in XML declaration" }, + { 108, "end of document before standalone declaration value in XML declaration" }, + { 109, "end of document before attribute name" }, + { 110, "end of document before attribute value" }, + { 111, "end of document before character/entity reference in attribute value" }, + { 112, "end of document before empty element tag" }, + { 113, "end of document before root element name" }, + { 114, "end of document before element name" }, + { 115, "end of document before character data in element content" }, + { 116, "end of document before processing instruction in element content" }, + { 117, "end of document before comment or CDATA section in element content" }, + { 118, "end of document before comment in element content" }, + { 119, "end of document before CDATA section in element content" }, + { 120, "end of document before character/entity reference in element content" }, + { 121, "end of document before after close of root element" }, + { 122, "possible invalid start of a document type" }, + { 123, "duplicate document type" }, + { 124, "root element name must start with [A-Za-z_:]" }, + { 125, "first attribute name must start with [A-Za-z_:]" }, + { 126, "invalid character in or after element name" }, + { 127, "attribute name not followed by '=' " }, + { 128, "invalid attribute value delimiter" }, + { 130, "attribute name must start with [A-Za-z_:]" }, + { 131, "invalid character in or after attribute name" }, + { 132, "empty element tag not terminated with '/>'" }, + { 133, "element end tag name name must start with [A-Za-z_:]" }, + { 134, "element end tag not terminated with '>'" }, + { 135, "element name must start with [A-Za-z_:]" }, + { 136, "invalid start of comment/CDATA in element" }, + { 137, "invalid start of comment" }, + { 138, "processing instruction target name must start with [A-Za-z_:]" }, + { 139, "invalid character in/afterprocessing instruction target name" }, + { 140, "processing instruction not terminated with '?>'" }, + { 141, "invalid character following '&' in a character/entity reference" }, + { 142, "missing version information in XML declaration" }, + { 143, "missing '=' after 'version' in XML declaration " }, + { 144, "missing XML version declaration " }, + { 145, "invalid character in XML version information" }, + { 146, "invalid character following XML version information value " }, + { 147, "invalid attribute in XML declaration" }, + { 148, "missing '=' after 'encoding' in XML declaration" }, + { 149, "missing XML encoding declaration value" }, + { 150, "invalid XML encoding declaration value" }, + { 151, "invalid character afer XML declaration" }, + { 152, "invalid attribute XML declaration" }, + { 153, "missing '=' after standalone XML declaration" }, + { 154, "missing standalone XML declaration value" }, + { 155, "standalone declaration must be 'yes' or 'no'" }, + { 156, "invalid standalone XML declaration value" }, + { 157, "invalid character following XML standalone declaration value" }, + { 158, "unterminated XML declaration " }, + { 159, "start of document type declaration after end of root element" }, + { 160, "start of element after end of root element" }, + { 161, "invalid UTF-8 byte sequence" }, + { 162, "UTF-8 character that has a Unicode code point above x'FFFF'" }, + { 315, "UTF-16 document little-endian unsupported" }, + { 316, "UCS4 document unsupported" }, + { 317, "unrecognized document encoding" }, + { 318, "UTF-8 document unsupported " }, + { 320, "mismatched national document data item to document encoding EBCDIC" }, + { 321, "mismatched national document data item to document encoding ASCII" }, + { 322, "mismatched native alphanumeric document data item to document encoding EBCDIC" }, + { 323, "mismatched host alphanumeric document data item to document encoding ASCII" }, + { 324, "mismatched national document data item to document encoding UTF-8" }, + { 325, "mismatched host alphanumeric document datat to document encoding UTF-8" }, + { 500, "internal error" }, +}, *eoxml_ec_values = xml_ec_values + COUNT_OF(xml_ec_values); + +static const xml_ec_value_t * +xml_ec_value_of( int ibm_code ) { + if( 100000 < ibm_code && ibm_code < 200000 ) { + static xml_ec_value_t not_ebcdic{ 0, "EBCDIC document encoding " + "does not match code page" }; + not_ebcdic.ibm_code = ibm_code; + return ¬_ebcdic; + } + if( 200000 < ibm_code && ibm_code < 300000 ) { + static xml_ec_value_t not_ascii{ 0, "ASCII document encoding " + "does not match code page" }; + not_ascii.ibm_code = ibm_code; + return ¬_ascii; + } + auto p = std::find_if( xml_ec_values, eoxml_ec_values, + [ibm_code]( const auto& value ) { + return ibm_code == value.ibm_code; + } ); + return p < eoxml_ec_values ? &*p : nullptr; +} + +const char * +xml_ec_value_str( int ibm_code ) { + auto p = xml_ec_value_of(ibm_code); + return p? p->msg : nullptr; +} + +#if NEEDED +static bool +xml_fatal( int ibm_code ) { + if( ibm_code < 100 ) return false; + if( ibm_code > 100000 ) return false; + assert(ibm_code < 1000); + return true; +} +#endif + +static callback_t *cobol_callback; + +/* + * Internal handler functions + */ +/////////////// +/* + +ATTRIBUTE-CHARACTER The single character that corresponds with the predefined entity reference in the attribute value +ATTRIBUTE-CHARACTERS The value within quotation marks or apostrophes. This can be a substring of the attribute value if the value includes an entity reference. +ATTRIBUTE-NAME The attribute name; the string to the left of the equal sign +ATTRIBUTE-NATIONAL-CHARACTER Regardless of the type of the XML document specified by identifier-1 in the XML PARSE statement, XML-TEXT is empty with length zero and XML-NTEXT contains the single national character that corresponds with the numeric character reference. + +CONTENT-CHARACTER The single character that corresponds with the predefined entity reference in the element content + +CONTENT-NATIONAL-CHARACTER Regardless of the type of the XML document specified by identifier-1 in the XML PARSE statement, XML-TEXT is empty with length zero and XML-NTEXT contains the single national character that corresponds with the numeric character reference.1 +DOCUMENT-TYPE-DECLARATION The entire document type declaration, including the opening and closing character sequences "<!DOCTYPE" and ">" +ENCODING-DECLARATION The value, between quotes or apostrophes, of the encoding declaration in the XML declaration +END-OF-CDATA-SECTION The string "]]>" +END-OF-DOCUMENT Empty with length zero + +EXCEPTION The part of the document that was successfully scanned, up to and including the point at which the exception was detected.2 Special register XML-CODE contains the unique error code that identifies the exception. + +PROCESSING-INSTRUCTION-TARGET The processing instruction target name, which occurs immediately after the processing instruction opening sequence, "<?" +STANDALONE-DECLARATION The value, between quotation marks or apostrophes ("yes" or "no"), of the stand-alone declaration in the XML declaration +START-OF-CDATA-SECTION The string "<![CDATA[" +START-OF-DOCUMENT The entire document + +UNKNOWN-REFERENCE-IN-CONTENT The entity reference name, not including the "&" and ";" delimiters +UNKNOWN-REFERENCE-IN-ATTRIBUTE The entity reference name, not including the "&" and ";" delimiters +VERSION-INFORMATION The value, between quotation marks or apostrophes, of the version information in the XML declaration + +*/ +/////////////// + +extern cblc_field_t __ggsr__xml_event; +extern cblc_field_t __ggsr__xml_code; +extern cblc_field_t __ggsr__xml_text; +extern cblc_field_t __ggsr__xml_ntext; + +static void +xml_event( const char event_name[], size_t len, char text[] ) { + assert(strlen(event_name) < __ggsr__xml_event.allocated); + + auto pend = __ggsr__xml_event.data + __ggsr__xml_event.allocated; + auto p = std::copy( event_name, event_name + strlen(event_name), + PTRCAST(char, __ggsr__xml_event.data) ); + std::fill(PTRCAST(unsigned char, p), pend, 0x20); + + __ggsr__xml_text.data = reinterpret_cast<unsigned char*>(text); + __ggsr__xml_text.capacity = __ggsr__xml_text.allocated = len; + __ggsr__xml_code.data = 0; + cobol_callback(); +} + +static inline void +xml_event( const char event_name[], char text[] ) { + xml_event(event_name, strlen(text), text); +} + +static inline void +xml_event( const char event_name[], size_t len, const xmlChar * value ) { + char *text = reinterpret_cast<char*>(const_cast<xmlChar*>(value)); + xml_event(event_name, len, text); +} + +static inline void +xml_event( const char event_name[], const xmlChar * value ) { + char *text = reinterpret_cast<char*>(const_cast<xmlChar*>(value)); + xml_event(event_name, strlen(text), text); +} + +static void attributeDecl(void * ctx, + const xmlChar * elem, + const xmlChar * fullname, + int type, + int def, + const xmlChar * defaultValue, + xmlEnumerationPtr tree) +{ + fprintf(stderr, "%s:%d: elem=%s, name=%s, default=%s\n", + __func__, __LINE__, elem, fullname, defaultValue); +} + +static void cdataBlock(void * ctx, + const xmlChar * data, + int len) +{ + SAYSO_DATA(len, data); + xml_event("CONTENT-CHARACTERS", len, data); +} + +static void characters(void * ctx, + const xmlChar * data, + int len) +{ + SAYSO_DATA(len, data); + xml_event("CONTENT-CHARACTERS", len, data); +} + +static void comment(void * ctx, const xmlChar * value) { + SAYSO_DATAZ(value); + xml_event("COMMENT", value); +} + +static void elementDecl(void * ctx, + const xmlChar * name, + int type, + xmlElementContentPtr content) +{ SAYSO_DATAZ(name); } + +static void endDocument(void * ctx) +{ SAYSO(); } + +static void endElementNs(void * ctx, + const xmlChar * localname, + const xmlChar * prefix, + const xmlChar * URI) +{ + SAYSO_DATAZ(localname); + xml_event("END-OF-ELEMENT", localname); +} + +static void endElement(void * ctx, + const xmlChar * name) +{ SAYSO_DATAZ(name); } + +static void entityDecl(void * ctx, + const xmlChar * name, + int type, + const xmlChar * publicId, + const xmlChar * systemId, + xmlChar * content) +{ SAYSO_DATAZ(name); } + +static void error(void * ctx, const char * msg, ...) +{ + va_list ap; + va_start (ap, msg); + fprintf(stderr, "error: "); + vfprintf(stderr, msg, ap); + fprintf(stderr, "\n"); + va_end (ap); +} + +static void externalSubset(void * ctx, + const xmlChar * name, + const xmlChar * ExternalID, + const xmlChar * SystemID) +{ SAYSO_DATAZ(name); } + +static void fatalError(void * ctx, const char * msg, ...) +{ + va_list ap; + va_start (ap, msg); + fprintf(stderr, "fatal: "); + vfprintf(stderr, msg, ap); + fprintf(stderr, "\n"); + va_end (ap); +} + +static xmlEntityPtr getEntity(void * ctx, + const xmlChar * name) +{ SAYSO_DATAZ(name); } + +static xmlEntityPtr getParameterEntity(void * ctx, + const xmlChar * name) +{ SAYSO_DATAZ(name); } + +static int hasExternalSubset(void * ctx) +{ SAYSO(); } + +static int hasInternalSubset(void * ctx) +{ SAYSO(); } + +static void ignorableWhitespace(void * ctx, + const xmlChar * ch, + int len) +{ SAYSO_DATA(len, ch); } + +static void internalSubset(void * ctx, + const xmlChar * name, + const xmlChar * ExternalID, + const xmlChar * SystemID) +{ SAYSO_DATAZ(name); } + +static int isStandalone (void * ctx) +{ SAYSO(); } + + +static void notationDecl(void * ctx, + const xmlChar * name, + const xmlChar * publicId, + const xmlChar * systemId) +{ SAYSO_DATAZ(name); } + +static void processingInstruction(void * ctx, + const xmlChar * target, + const xmlChar * data) +{ + SAYSO_DATAZ(target); + xml_event("PROCESSING-INSTRUCTION-TARGET", target); + SAYSO_DATAZ(data); + xml_event("PROCESSING-INSTRUCTION-DATA", data); +} + +static void reference(void * ctx, + const xmlChar * name) +{ SAYSO_DATAZ(name); } + +static xmlParserInputPtr resolveEntity( void * ctx, + const xmlChar * publicId, + const xmlChar * systemId) +{ SAYSO(); } + +static void setDocumentLocator(void * ctx, + xmlSAXLocatorPtr loc) +{ SAYSO(); } + +/* + * Called after the XML declaration was parsed. + * Use xmlCtxtGetVersion(), xmlCtxtGetDeclaredEncoding() and + * xmlCtxtGetStandalone() to get data from the XML declaration. + */ +static void startDocument(void * ctx) +{ SAYSO(); } + +static void startElementNs(void * ctx, + const xmlChar * localname, + const xmlChar * prefix, + const xmlChar * URI, + int nb_namespaces, + const xmlChar ** namespaces, + int nb_attributes, + int nb_defaulted, + const xmlChar ** attributes) +{ + SAYSO_DATAZ(localname); + xml_event("START-OF-ELEMENT", localname); +} + +static void startElement(void * ctx, + const xmlChar * name, + const xmlChar ** atts) +{ SAYSO_DATAZ(name); } + +static void unparsedEntityDecl(void * ctx, + const xmlChar * name, + const xmlChar * publicId, + const xmlChar * systemId, + const xmlChar * notationName) +{ SAYSO_DATAZ(name); } + +static void warning(void * ctx, const char * msg, ... ) +{ + va_list ap; + va_start (ap, msg); + fprintf(stderr, "warning: "); + vfprintf(stderr, msg, ap); + fprintf(stderr, "\n"); + va_end (ap); +} + +/* + * xmlSAXHandler is a structure of function pointers that the SAX parser calls + * as it encounters XML elements in the input. Each pointer is a callback + * function, locally defined in this file. These we term "handlers". + * + * Each handler sets the XML registers per IBM, and then calls + * cobol_callback(), which is a function pointer supplied by the COBOL program + * to be the processing procedure for XML PARSE. + * + * There is no obvious way to abort parsing at the C level. See: + * http://veillard.com/XML/messages/0540.html + * + * > The simplest to implement this would not be to add a new SAX + * > callback but rather modify the xmlParserCtxtPtr passed to the + * > callbacks. The best seems to be: + * > - set ctxt->instate to XML_PARSER_EOF + * > - hack xmlCurrentChar() to return 0 + * > if (ctxt->instate == XML_PARSER_EOF) + * > Doing both should led to a quick termination of parsing + * > (but endElement(s)/endDocument will certainly be called anyway). + * + * Another hack might be to set the input to all blanks in cobol_callback. + */ + +static xmlSAXHandler handlers; + +static void +initialize_handlers( callback_t *callback ) { + handlers = xmlSAXHandler {}; + handlers.initialized = XML_SAX2_MAGIC; + + cobol_callback = callback; + +#if 0 + //// Should typically not be modified + handlers.attributeDecl = attributeDecl; + handlers.elementDecl = elementDecl; + handlers.entityDecl = entityDecl; + handlers.externalSubset = externalSubset; + handlers.getEntity = getEntity; + handlers.getParameterEntity = getParameterEntity; + handlers.internalSubset = internalSubset; + handlers.notationDecl = notationDecl; + handlers.resolveEntity = resolveEntity; + handlers.unparsedEntityDecl = unparsedEntityDecl; + + //// Not supposed to be changed by applications + handlers.hasExternalSubset = hasExternalSubset; + handlers.hasInternalSubset = hasInternalSubset; + handlers.isStandalone = isStandalone; + + //// SAX 1 only + handlers.startElement = startElement; + handlers.endElement = endElement; + + //// Everything is available on the context, so this is useless in our case + handlers.setDocumentLocator = setDocumentLocator; +#endif + + handlers.cdataBlock = cdataBlock; + handlers.characters = characters; + handlers.comment = comment; + handlers.endDocument = endDocument; + handlers.endElementNs = endElementNs; + handlers.ignorableWhitespace = ignorableWhitespace; + handlers.processingInstruction = processingInstruction; + handlers.reference = reference; + handlers.startDocument = startDocument; + handlers.startElementNs = startElementNs; + handlers.error = error; + handlers.fatalError = fatalError; + handlers.warning = warning; +} + +extern "C" +int +__gg__xml_parse( const cblc_field_t *input_field, + size_t input_offset, + size_t len, + cblc_field_t *encoding, + cblc_field_t *validating, + int returns_national, + void (*callback)(void) ) +{ + initialize_handlers(callback); + + const char *input = PTRCAST(char, input_field->data + input_offset); + + int erc = xmlSAXUserParseMemory(&handlers, nullptr, input, len); + + if( erc ) { + xmlErrorPtr msg = xmlCtxtGetLastError(nullptr); + fprintf(stderr, "XML PARSE: error: line %d: %s (%d: %d.%d.%d)\n", + msg->line, msg->message, erc, msg->domain, msg->level, msg->code); + } + return erc; +} + + |
