#!/usr/bin/env python3
#
# python script to process makedoc instructions in a source file and produce
# DocBook XML output
#
#
# This performs 3 stages of processing on its input, in a similar fashion
# to makedoc:
#
# 1. Discard everything outside of /* */ comments
# 2. Identify lines which contain commands (a single uppercase word)
# 3. Apply each command to the text of the following lines (up to the next
# command or the end of the comment block), to produce some output
#
# The resulting output contains one or more DocBook XML refentry elements.
#
# To make the output a valid XML document which can be xincluded, those refentry
# elements are contained by a refentrycontainer element. refentrycontainer is
# not part of the DocBook DTD and should be removed by a suitable XSLT.
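#
# As an illustrative sketch (not taken from any real source file), a comment
# such as:
#
#   /*
#   FUNCTION
#           <<frob>>---frobnicate a widget
#
#   SYNOPSIS
#           int frob(int <[n]>);
#
#   DESCRIPTION
#           <<frob>> frobnicates <[n]> widgets.
#   */
#
# passes through the three stages to become a refentry containing refmeta,
# refnamediv, refsynopsisdiv and refsect1 elements.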
#
from __future__ import print_function
import fcntl
import sys
import os
import re
from optparse import OptionParser
import lxml.etree
import ply.lex as lex
import ply.yacc as yacc
rootelement = None # root element of the XML tree
refentry = None # the current refentry
verbose = 0
def dump(s, stage, threshold=1):
if verbose > threshold:
print('*' * 40, file=sys.stderr)
print(stage, file=sys.stderr)
print('*' * 40, file=sys.stderr)
print('%s' % s, file=sys.stderr)
print('*' * 40, file=sys.stderr)
#
# Stage 1
#
def skip_whitespace_and_stars(i, src):
while i < len(src) and (src[i].isspace() or (src[i] == '*' and src[i + 1] != '/')):
i += 1
return i
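# For example (illustrative values):
#
#   >>> skip_whitespace_and_stars(0, ' * some text */')
#   3
#
# i.e. the leading blank and '*' are stepped over, but a '*' which begins the
# '*/' comment terminator is not.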
# Discard everything not inside '/* */'-style comments which start at column 0
# Discard any leading blank space or '*'
# Discard a single leading '.'
# Discard blank lines after a blank line
def comment_contents_generator(src):
i = 0
while i < len(src) - 2:
if src[i] == '\n' and src[i + 1] == '/' and src[i + 2] == '*':
i = i + 3
i = skip_whitespace_and_stars(i, src)
if src[i] == '.':
i += 1
while i < len(src):
if src[i] == '\n':
yield '\n'
i += 1
# allow a single blank line
if i < len(src) and src[i] == '\n':
yield '\n'
i += 1
i = skip_whitespace_and_stars(i, src)
elif src[i] == '*' and src[i + 1] == '/':
i = i + 2
# If we have just output \n\n, this adds another blank line.
# This is the only way a double blank line can occur.
yield '\nEND\n'
break
else:
yield src[i]
i += 1
else:
i += 1
def remove_noncomments(src):
src = '\n' + src
dst = ''.join(comment_contents_generator(src))
dump(dst, 'extracted from comments')
return dst
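# As an illustrative sketch, the input
#
#   int f(void);        /* not at column 0, so discarded */
#
#   /*
#   .remaining text
#   */
#
# reduces to 'remaining text' (the leading '.' is dropped) followed by an
# inserted END line, so stage 2 always sees a command terminating each block.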
#
# Stage 2
#
# A command is a single word of at least 3 characters, all uppercase or underscore, alone on a line
def iscommand(l):
if re.match(r'^[A-Z_]{3,}\s*$', l):
return True
return False
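# For example:
#
#   >>> iscommand('DESCRIPTION')
#   True
#   >>> iscommand('TRAD_SYNOPSIS')
#   True
#   >>> iscommand('Description')
#   False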
def command_block_generator(content):
command = 'START'
text = ''
for l in content.splitlines():
if iscommand(l):
yield (command, text)
command = l.rstrip()
text = ''
else:
text = text + l + '\n'
yield (command, text)
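# As a sketch (hypothetical input), the block
#
#   FUNCTION
#   <<frob>>---frobnicate
#   DESCRIPTION
#   Does things.
#
# yields ('START', ''), ('FUNCTION', '<<frob>>---frobnicate\n') and
# ('DESCRIPTION', 'Does things.\n').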
# Look for commands, which give instructions how to process the following input
def process(content):
content = content.lstrip()
dump(content, 'about to process for commands')
# process into a list of tuples of commands and the associated following text
# it is important to maintain the order of the sections the commands generate
processed = list(command_block_generator(content))
return processed
#
# Stage 3
#
# invoke each command on its text
def perform(processed):
for i in processed:
c = i[0].rstrip()
t = i[1].strip() + '\n'
if verbose:
print("performing command '%s'" % c, file=sys.stderr)
if c in command_dispatch_dict:
command_dispatch_dict[c](c, t)
else:
print("command '%s' is not recognized" % c, file=sys.stderr)
# the text following an unrecognized command is discarded
# FUNCTION (aka TYPEDEF)
#
def function(c, l):
global refentry
global rootelement
l = l.strip()
if verbose:
print('FUNCTION %s' % l, file=sys.stderr)
separator = '---'
if ';' in l:
# fpclassify has an unusual format we also need to handle
spliton = ';'
l = l.splitlines()[0]
elif len(l.splitlines()) > 1:
# a few pages like mktemp have two '---' lines
spliton = ';'
o = ''
for i in l.splitlines():
if separator in i:
o += i + ';'
else:
o += i
l = o[:-1]
else:
spliton = '\n'
namelist = []
descrlist = []
for a in l.split(spliton):
(n, d) = a.split(separator, 1)
namelist = namelist + n.split(',')
descrlist = descrlist + [d]
# only copysign and log1p use <[ ]> markup in descr,
# only gets() uses << >> markup
# but we should handle it correctly
descr = line_markup_convert(', '.join(descrlist))
# fpclassify includes an 'and' we need to discard
namelist = map(lambda v: re.sub(r'^and ', r'', v.strip(), 1), namelist)
# strip off << >> surrounding name
namelist = map(lambda v: v.strip().lstrip('<').rstrip('>'), namelist)
# instantiate list to make it subscriptable
namelist = list(namelist)
if verbose:
print(namelist, file=sys.stderr)
# additional alternate names may also appear in INDEX commands
# create the root element if needed
if rootelement is None:
rootelement = lxml.etree.Element('refentrycontainer')
# FUNCTION implies starting a new refentry
if refentry is not None:
sys.exit("multiple FUNCTIONs without NEWPAGE")
# create the refentry
refentry = lxml.etree.SubElement(rootelement, 'refentry')
refentry.append(lxml.etree.Comment(' Generated by makedocbook.py '))
refentry.set('id', namelist[0].lstrip('_'))
refmeta = lxml.etree.SubElement(refentry, 'refmeta')
# refentrytitle will be same as refdescriptor, the primary name
refentrytitle = lxml.etree.SubElement(refmeta, 'refentrytitle')
refentrytitle.text = namelist[0]
manvolnum = lxml.etree.SubElement(refmeta, 'manvolnum')
manvolnum.text = '3'
refnamediv = lxml.etree.SubElement(refentry, 'refnamediv')
# refdescriptor is the primary name, assume we should use the one which
# appears first in the list
refdescriptor = lxml.etree.SubElement(refnamediv, 'refdescriptor')
refdescriptor.text = namelist[0]
# refname elements exist for all alternate names
for n in namelist:
refname = lxml.etree.SubElement(refnamediv, 'refname')
refname.text = n
refpurpose = lxml.etree.SubElement(refnamediv, 'refpurpose')
    # parse descr as XML so any inline markup it contains is preserved
    refnamediv.replace(refpurpose, lxml.etree.fromstring('<refpurpose>' + descr + '</refpurpose>'))
# Only FUNCTION currently exists, which implies that the SYNOPSIS should be
# a funcsynopsis. If TYPEDEF was to be added, SYNOPSIS should be processed
# in a different way, probably producing a refsynopsis.
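# As an illustrative example (hypothetical page), the FUNCTION text
#
#   <<frob>>, <<frobl>>---frobnicate a widget
#
# produces namelist ['frob', 'frobl'] and the refpurpose 'frobnicate a
# widget'; 'frob' becomes the refentrytitle, refdescriptor and refentry id.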
# INDEX
# may occur more than once for each FUNCTION, giving alternate names this
# function should be indexed under
#
def index(c, l):
l = l.strip()
if verbose:
print('INDEX %s' % l, file=sys.stderr)
# discard anything after the first word
l = l.split()[0]
# add indexterm
# (we could just index under all the refnames, but we control the indexing
# separately as that is what makedoc does)
indexterm = lxml.etree.SubElement(refentry, 'indexterm')
primary = lxml.etree.SubElement(indexterm, 'primary')
primary.text = l
# to validate, it seems we need to maintain refentry elements in a certain order
refentry[:] = sorted(refentry, key=lambda x: x.tag if isinstance(x.tag, str) else '')
# adds another alternate refname
refnamediv = refentry.find('refnamediv')
# as long as it doesn't already exist
if not refnamediv.xpath(('refname[.="%s"]') % l):
refname = lxml.etree.SubElement(refnamediv, 'refname')
refname.text = l
if verbose > 1:
print('added refname %s' % l, file=sys.stderr)
else:
if verbose > 1:
print('duplicate refname %s discarded' % l, file=sys.stderr)
# to validate, it seems we need to maintain refnamediv elements in a certain order
refnamediv[:] = sorted(refnamediv, key=lambda x: x.tag)
# SYNOPSIS aka ANSI_SYNOPSIS
# ANSI-style synopsis
#
# Note that makedoc would also process <> markup here, but there are no
# such uses.
#
def synopsis(c, t):
refsynopsisdiv = lxml.etree.SubElement(refentry, 'refsynopsisdiv')
funcsynopsis = lxml.etree.SubElement(refsynopsisdiv, 'funcsynopsis')
s = ''
for l in t.splitlines():
if re.match(r'\s*(#|\[|struct)', l):
# preprocessor # directives, structs, comments in square brackets
funcsynopsisinfo = lxml.etree.SubElement(funcsynopsis, 'funcsynopsisinfo')
funcsynopsisinfo.text = l.strip() + '\n'
elif re.match(r'[Ll]ink with', l):
pass
else:
s = s + l
            # a prototype without a terminating ';' is an error
            if s.endswith(')'):
                sys.exit("'%s' missing terminating semicolon" % l)

            if ';' in s:
                synopsis_for_prototype(funcsynopsis, s)
                s = ''
if s.strip():
sys.exit("surplus synopsis '%s'" % s)
def synopsis_for_prototype(funcsynopsis, s):
s = s.strip()
# funcsynopsis has a very detailed content model, so we need to massage the
# bare prototype into it. Fortunately, since the parameter names are marked
# up, we have enough information to do this.
for fp in s.split(';'):
fp = fp.strip()
if fp:
if verbose:
print("'%s'" % fp, file=sys.stderr)
match = re.match(r'(.*?)([\w\d]*) ?\((.*)\)', fp)
if verbose:
print(match.groups(), file=sys.stderr)
funcprototype = lxml.etree.SubElement(funcsynopsis, 'funcprototype')
funcdef = lxml.etree.SubElement(funcprototype, 'funcdef')
funcdef.text = match.group(1)
function = lxml.etree.SubElement(funcdef, 'function')
function.text = match.group(2)
if match.group(3).strip() == 'void':
void = lxml.etree.SubElement(funcprototype, 'void')
else:
# Split parameters on ',' except if it is inside ()
for p in re.split(r',(?![^()]*\))', match.group(3)):
p = p.strip()
if verbose:
print(p, file=sys.stderr)
if p == '...':
varargs = lxml.etree.SubElement(funcprototype, 'varargs')
else:
paramdef = lxml.etree.SubElement(funcprototype, 'paramdef')
parameter = lxml.etree.SubElement(paramdef, 'parameter')
# <[ ]> enclose the parameter name
match2 = re.match(r'(.*)<\[(.*)\]>(.*)', p)
if verbose:
print(match2.groups(), file=sys.stderr)
paramdef.text = match2.group(1)
parameter.text = match2.group(2)
parameter.tail = match2.group(3)
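# As a sketch (hypothetical prototype), the synopsis line
#
#   int frob(int <[n]>, ...);
#
# becomes a funcprototype whose funcdef contains 'int ' and a function element
# 'frob', with one paramdef for 'int n' and a varargs element for '...'.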
# DESCRIPTION
# (RETURNS, ERRORS, PORTABILITY, BUGS, WARNINGS, SEEALSO, NOTES are handled the same)
#
# Create a refsect with a title corresponding to the command
#
# Nearly all of the existing DESCRIPTION contents could be transformed into
# DocBook with a few regex substitutions. Unfortunately, pages like sprintf and
# sscanf have very complex layout using nested tables and itemized lists, which
# it is best to parse in order to transform correctly.
#
def refsect(t, s):
refsect = lxml.etree.SubElement(refentry, 'refsect1')
title = lxml.etree.SubElement(refsect, 'title')
title.text = t.title()
if verbose:
print('%s has %d paragraphs' % (t, len(s.split('\n\n'))), file=sys.stderr)
    if verbose > 1:
        dump(s, 'before lexing')

        # dump out lexer token sequence
        lex.input(s)
        for tok in lexer:
            print(tok, file=sys.stderr)
# parse the section text for makedoc markup and the few pieces of texinfo
# markup we understand, and output an XML marked-up string
xml = parser.parse(s, tracking=True, debug=(verbose > 2))
dump(xml, 'after parsing')
    # wrap the parser output in a dummy element so it parses as one XML tree;
    # refsect.extend() then moves just its children into the refsect1
    xml = '<dummy>' + xml + '</dummy>'
refsect.extend(lxml.etree.fromstring(xml))
def seealso(c, t):
refsect('SEE ALSO', t)
# NEWPAGE
#
# start a new refentry
def newpage(c, t):
global refentry
refentry = None
# command dispatch table
def discarded(c, t):
return
command_dispatch_dict = {
'FUNCTION': function,
'TYPEDEF': function, # TYPEDEF is not currently used, but described in doc.str
'INDEX': index,
'TRAD_SYNOPSIS': discarded, # K&R-style synopsis, obsolete and discarded
'ANSI_SYNOPSIS': synopsis,
'SYNOPSIS': synopsis,
'DESCRIPTION': refsect,
'RETURNS': refsect,
'ERRORS': refsect,
'PORTABILITY': refsect,
'BUGS': refsect,
'WARNINGS': refsect,
'SEEALSO': seealso,
'NOTES': refsect, # NOTES is not described in doc.str, so is currently discarded by makedoc, but that doesn't seem right
'QUICKREF': discarded, # The intent of QUICKREF and MATHREF is not obvious, but they don't generate any output currently
'MATHREF': discarded,
'START': discarded, # a START command is inserted to contain the text before the first command
'END': discarded, # an END command is inserted merely to terminate the text for the last command in a comment block
'NEWPAGE': newpage,
}
#
# Utility functions
#
# apply transformations which are easy to do in-place
def line_markup_convert(p):
s = p
    # escape characters not allowed in XML
    s = s.replace('&', '&amp;')
    s = s.replace('<', '&lt;')
    s = s.replace('>', '&gt;')

    # convert <<somecode>> to <code>somecode</code>
    # and <[var]> to <varname>var</varname>
    # also handle nested << <[ ]> >> correctly
    s = s.replace('&lt;&lt;', '<code>')
    s = s.replace('&lt;[', '<varname>')
    s = s.replace(']&gt;', '</varname>')
    s = s.replace('&gt;&gt;', '</code>')

    # also convert some simple texinfo markup
    # convert @emph{foo} to <emphasis>foo</emphasis>
    s = re.sub(r'@emph{(.*?)}', r'<emphasis>\1</emphasis>', s)
    # convert @strong{foo} to <emphasis role="strong">foo</emphasis>
    s = re.sub(r'@strong{(.*?)}', r'<emphasis role="strong">\1</emphasis>', s)
    # convert @minus{} to U+2212 MINUS SIGN
    s = s.replace('@minus{}', '&#x2212;')
    # convert @dots{} to U+2026 HORIZONTAL ELLIPSIS
    s = s.replace('@dots{}', '&#x2026;')

    # convert xref and pxref
    s = re.sub(r'@xref{(.*?)}', r"See <xref linkend='\1'/>", s)

    # very hacky way of dealing with @* to force a newline
    s = s.replace('@*', '</para><para>')

    # fail if there are unhandled texinfo commands
    match = re.search(r'(?<!@)@\w+', s)
    if match:
        sys.exit("texinfo command '%s' remains in output" % match.group(0))

    if (verbose > 3) and (s != p):
        print('%s-> line_markup_convert ->\n%s' % (p, s), file=sys.stderr)

    return s
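# For example (illustrative string):
#
#   >>> line_markup_convert('<<frob>> takes <[n]> & @emph{more}')
#   '<code>frob</code> takes <varname>n</varname> &amp; <emphasis>more</emphasis>'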
#
# lexer
#
texinfo_commands = {
'ifnottex': 'IFNOTTEX',
'end ifnottex': 'ENDIFNOTTEX',
'tex': 'IFTEX',
'end tex': 'ENDIFTEX',
'comment': 'COMMENT',
'c ': 'COMMENT',
'multitable': 'MULTICOLUMNTABLE',
'end multitable': 'ENDMULTICOLUMNTABLE',
'headitem': 'MCT_HEADITEM',
'tab': 'MCT_COLUMN_SEPARATOR',
'item': 'MCT_ITEM',
}
# token names
tokens = [
'BLANKLINE',
'BULLETEND',
'BULLETSTART',
'COURIER',
'EOF',
'ITEM',
'TABLEEND',
'TABLESTART',
'TEXINFO',
'TEXT',
] + list(set(texinfo_commands.values()))
# regular expression rules for tokens, in priority order
# (all these expressions should match a whole line)
def t_TEXINFO(t):
    # this matches any @command, but not @command{} which just happens to be
    # at the start of a line
r'@\w+[^{]*?\n'
# if the line starts with a known texinfo command, change t.type to the
# token for that command
for k in texinfo_commands.keys():
if t.value[1:].startswith(k):
t.type = texinfo_commands[k]
break
return t
def t_COURIER(t):
r'[.|].*\n'
t.value = line_markup_convert(t.value[1:])
return t
def t_BULLETSTART(t):
r'O\+\n'
return t
def t_BULLETEND(t):
r'O-\n'
return t
def t_TABLESTART(t):
r'o\+\n'
return t
def t_TABLEEND(t):
r'o-\n'
return t
def t_ITEM(t):
r'o\s.*\n'
t.value = re.sub(r'o\s', r'', lexer.lexmatch.group(0), 1)
t.value = line_markup_convert(t.value)
return t
def t_TEXT(t):
r'.+\n'
t.value = line_markup_convert(t.value)
t.lexer.lineno += 1
return t
def t_BLANKLINE(t):
r'\n'
t.lexer.lineno += 1
return t
def t_eof(t):
if hasattr(t.lexer, 'at_eof'):
# remove eof flag ready for lexing next input
delattr(t.lexer, 'at_eof')
t.lexer.lineno = 0
return None
t.type = 'EOF'
t.lexer.at_eof = True
return t
# Error handling rule
def t_error(t):
sys.exit("tokenization error, remaining text '%s'" % t.value)
lexer = lex.lex()
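# As a sketch (hypothetical section text), lexing
#
#   O+
#   o first item
#   O-
#
# produces the token sequence BULLETSTART, ITEM, BULLETEND, EOF, with the text
# of the ITEM already passed through line_markup_convert.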
#
# parser
#
def parser_verbose(p):
if verbose > 2:
print(p[0], file=sys.stderr)
def p_input(p):
'''input : paragraph
| input paragraph'''
if len(p) == 3:
p[0] = p[1] + '\n' + p[2]
else:
p[0] = p[1]
parser_verbose(p)
# Strictly, text at top level should be paragraphs (i.e. terminated by a
# BLANKLINE), while text contained in rows or bullets may not be, but this
# grammar doesn't enforce that for simplicity's sake.
def p_paragraph(p):
'''paragraph : paragraph_content maybe_eof_or_blankline'''
    p[0] = '<para>\n' + p[1] + '</para>'
parser_verbose(p)
def p_paragraph_content(p):
'''paragraph_content : paragraph_line
| paragraph_line paragraph_content'''
if len(p) == 3:
p[0] = p[1] + p[2]
else:
p[0] = p[1]
parser_verbose(p)
def p_paragraph_line(p):
'''paragraph_line : TEXT
| texinfocmd
| courierblock
| table
| bulletlist'''
p[0] = p[1]
def p_empty(p):
'empty :'
p[0] = ''
def p_maybe_eof_or_blankline(p):
'''maybe_eof_or_blankline : empty
| EOF
| BLANKLINE
| BLANKLINE EOF'''
p[0] = ''
def p_maybe_lines(p):
'''maybe_lines : empty
| paragraph maybe_lines'''
if len(p) == 3:
p[0] = p[1] + p[2]
else:
p[0] = p[1]
parser_verbose(p)
def p_maybe_blankline(p):
'''maybe_blankline : empty
| BLANKLINE'''
p[0] = ''
def p_courierblock(p):
'''courierblock : courier'''
    p[0] = '<literallayout class="monospaced">' + p[1] + '</literallayout>'
parser_verbose(p)
def p_courier(p):
'''courier : COURIER
| COURIER courier'''
if len(p) == 3:
p[0] = p[1] + p[2]
else:
p[0] = p[1]
parser_verbose(p)
def p_bullet(p):
'''bullet : ITEM maybe_lines
| ITEM BLANKLINE maybe_lines'''
    if len(p) == 3:
        # Glue any text in ITEM into the first para of maybe_lines
        # (This is an unfortunate consequence of the line-based tokenization we do)
        if p[2].startswith('<para>'):
            p[0] = '<listitem><para>' + p[1] + p[2][len('<para>'):] + '</listitem>'
        else:
            p[0] = '<listitem><para>' + p[1] + '</para>' + p[2] + '</listitem>'
    else:
        p[0] = '<listitem><para>' + p[1] + '</para>' + p[3] + '</listitem>'
parser_verbose(p)
def p_bullets(p):
'''bullets : bullet
| bullet bullets'''
if len(p) == 3:
p[0] = p[1] + '\n' + p[2]
else:
p[0] = p[1]
parser_verbose(p)
def p_bulletlist(p):
'''bulletlist : BULLETSTART bullets BULLETEND maybe_blankline'''
    p[0] = '<itemizedlist>' + p[2] + '</itemizedlist>'
parser_verbose(p)
def p_row(p):
'''row : ITEM maybe_lines
| ITEM BLANKLINE maybe_lines'''
    if len(p) == 3:
        p[0] = '<row><entry><code>' + p[1] + '</code></entry><entry>' + p[2] + '</entry></row>'
    else:
        p[0] = '<row><entry><code>' + p[1] + '</code></entry><entry>' + p[3] + '</entry></row>'
parser_verbose(p)
def p_rows(p):
'''rows : row
| row rows'''
if len(p) == 3:
p[0] = p[1] + '\n' + p[2]
else:
p[0] = p[1]
parser_verbose(p)
def p_table(p):
'''table : TABLESTART rows TABLEEND maybe_blankline'''
    p[0] = '<informaltable><tgroup cols="2"><tbody>' + p[2] + '</tbody></tgroup></informaltable>'
parser_verbose(p)
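# As an illustrative sketch, a makedoc table such as
#
#   o+
#   o EINVAL
#   The argument was invalid.
#   o-
#
# becomes an informaltable with one row per 'o' item: the item text lands in
# the first entry and the following lines in the second.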
def p_texinfocmd(p):
'''texinfocmd : unknown_texinfocmd
| comment
| multitable
| nottex
| tex'''
p[0] = p[1]
def p_unknown_texinfocmd(p):
'''unknown_texinfocmd : TEXINFO'''
print("unknown texinfo command '%s'" % p[1].strip(), file=sys.stderr)
p[0] = p[1]
parser_verbose(p)
def p_nottex(p):
'''nottex : IFNOTTEX paragraph_content ENDIFNOTTEX'''
p[0] = p[2]
def p_tex(p):
'''tex : IFTEX paragraph_content ENDIFTEX'''
# text for TeX formatter inside @iftex is discarded
p[0] = ''
def p_comment(p):
'''comment : COMMENT'''
# comment text is discarded
p[0] = ''
def p_mct_columns(p):
'''mct_columns : maybe_lines
| maybe_lines MCT_COLUMN_SEPARATOR mct_columns'''
    if len(p) == 4:
        p[0] = '<entry>' + p[1] + '</entry>' + p[3]
    else:
        p[0] = '<entry>' + p[1] + '</entry>'
parser_verbose(p)
def p_mct_row(p):
'''mct_row : MCT_ITEM mct_columns'''
    p[0] = '<row>' + p[2] + '</row>'
parser_verbose(p)
def p_mct_rows(p):
'''mct_rows : mct_row
| mct_row mct_rows'''
if len(p) == 3:
p[0] = p[1] + '\n' + p[2]
else:
p[0] = p[1]
parser_verbose(p)
def p_mct_header(p):
'''mct_header : MCT_HEADITEM mct_columns'''
    p[0] = '<row>' + p[2] + '</row>'
parser_verbose(p)
def p_multitable(p):
'''multitable : MULTICOLUMNTABLE mct_header mct_rows ENDMULTICOLUMNTABLE'''
# this doesn't handle the prototype row form of @multitable, only the @columnfractions form
colfrac = p[1].replace('@multitable @columnfractions', '').split()
    colspec = '\n'.join(['<colspec colwidth="%s*"/>' % (c) for c in colfrac])
    header = '<thead>' + p[2] + '</thead>\n'
    body = '<tbody>' + p[3] + '</tbody>\n'
    p[0] = '<informaltable><tgroup cols="' + str(len(colfrac)) + '">' + colspec + header + body + '</tgroup></informaltable>'
parser_verbose(p)
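# As a sketch (hypothetical header line), '@multitable @columnfractions .3 .7'
# gives colfrac ['.3', '.7'], hence a two-column tgroup with colspec widths
# '.3*' and '.7*'.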
def p_error(t):
sys.exit('parse error at line %d, token %s, next token %s' % (t.lineno, t, parser.token()))
# protect creating the parser with a lockfile, so that when multiple processes
# are running this script simultaneously, we don't get one of them generating a
# parsetab.py file, while another one attempts to read it...
#
# see also https://github.com/dabeaz/ply/pull/184
with open(os.path.join(os.path.dirname(__file__), 'parsetab.lock'), 'w+') as lockfile:
fcntl.flock(lockfile.fileno(), fcntl.LOCK_EX)
parser = yacc.yacc(start='input')
fcntl.flock(lockfile.fileno(), fcntl.LOCK_UN)
#
#
#
def main(file):
content = file.read()
content = remove_noncomments(content)
processed = process(content)
perform(processed)
    # output the XML tree
    if rootelement is None:
        print('No output produced (perhaps the input has no makedoc markup?)', file=sys.stderr)
        exit(1)

    s = lxml.etree.tostring(rootelement, pretty_print=True, encoding='unicode')
    print(s)
#
#
#
if __name__ == '__main__':
options = OptionParser()
options.add_option('-v', '--verbose', action='count', dest='verbose', default=0)
(opts, args) = options.parse_args()
verbose = opts.verbose
if len(args) > 0:
        main(open(args[0], 'r'))
else:
main(sys.stdin)