debian-python-odf/contrib/syntaxhighlight/syntaxhighlight.py

509 lines
19 KiB
Python

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (C) 2007 Søren Roug, European Environment Agency
#
# This is free software. You may redistribute it under the terms
# of the Apache license and the GNU General Public License Version
# 2 or at your option any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#
# Contributor(s):
#
#
# Syntax Highlighting
# Originally from Peter Clive Wilkinson (http://www.petersblog.org/node/763)
#
import getopt
import io
import os
import re
import sys

from odf.opendocument import OpenDocumentText
from odf.style import FontFace, Style, TextProperties, ParagraphProperties
from odf.text import P, Span, S
class Highlight:
    """
    Do syntax highlighting.

    The input text is split into tokens with the per-language regular
    expression tables in oStyles. Every token is appended to an
    OpenDocument text document, one paragraph per source line, wrapped in
    a text span carrying the style that belongs to the token type.
    """

    # NOTE(review): the font and style elements below are class attributes,
    # so the very same element objects are added to the document of every
    # Highlight instance. Confirm that odfpy tolerates this before creating
    # more than one instance per process.
    courierfont = FontFace(name="Courier", fontfamily="Courier",
        fontadornments="Normal", fontfamilygeneric="modern", fontpitch="fixed")

    #--- Paragraph style --
    programliststyle = Style(name="Program Listing", family="paragraph")
    programliststyle.addElement(ParagraphProperties(border="0.002cm solid #000000", margin="0cm", padding="0.2cm"))
    programliststyle.addElement(TextProperties(fontname="Courier", fontsize="9pt", language="none", country="none"))

    #--- Text styles --
    puncstyle = Style(name="Highlighted Punctuation", family="text")
    puncstyle.addElement(TextProperties(fontweight="bold")) # Bold
    numberstyle = Style(name="Highlighted Number", family="text")
    numberstyle.addElement(TextProperties(color="#ff0000")) # Red
    keywordstyle = Style(name="Highlighted Keyword", family="text")
    keywordstyle.addElement(TextProperties(color="#b218b2", fontweight="bold")) # Purple, bold
    variablestyle = Style(name="Highlighted Magic", family="text")
    variablestyle.addElement(TextProperties(color="#0000ff")) # Blue
    tagstyle = Style(name="Highlighted Tag", family="text")
    tagstyle.addElement(TextProperties(color="#800000")) # Dark red
    # The attribute style needs a name of its own: it used to reuse
    # "Highlighted Tag", which made it collide with tagstyle.
    attrstyle = Style(name="Highlighted Attribute", family="text")
    attrstyle.addElement(TextProperties(color="#008000", fontweight="bold")) # Dark green, bold
    stringstyle = Style(name="Highlighted String", family="text")
    stringstyle.addElement(TextProperties(color="#800000")) # Dark red
    commentstyle = Style(name="Highlighted Comment", family="text")
    commentstyle.addElement(TextProperties(color="#0000ff", fontstyle="italic")) # Blue, italic
    preprocstyle = Style(name="Highlighted Preprocessing", family="text")
    preprocstyle.addElement(TextProperties(color="#ff00ff", fontstyle="italic")) # Magenta, italic

    def __init__(self, strMode):
        """
        Initialise highlighter: strMode = language (PYTHON, C, CPP, PHP, HTML)

        Creates the output document, registers the font and all styles in
        it and opens the first (empty) listing paragraph.
        """
        self.textdoc = OpenDocumentText()
        self.textdoc.fontfacedecls.addElement(self.courierfont)
        self.textdoc.styles.addElement(self.programliststyle)
        self.textdoc.styles.addElement(self.puncstyle)
        self.textdoc.styles.addElement(self.numberstyle)
        self.textdoc.styles.addElement(self.keywordstyle)
        self.textdoc.styles.addElement(self.variablestyle)
        self.textdoc.styles.addElement(self.tagstyle)
        self.textdoc.styles.addElement(self.attrstyle)
        self.textdoc.styles.addElement(self.stringstyle)
        self.textdoc.styles.addElement(self.commentstyle)
        self.textdoc.styles.addElement(self.preprocstyle)

        # Style of the span currently being written (None = plain text).
        self.strSpanStyle = None
        self.currPara = P(stylename=self.programliststyle)
        self.textdoc.text.addElement(self.currPara)
        self.currSpan = None
        # Delimiter that opened the multi-line string being scanned, if any.
        self.strMultilineString = ''

        #
        # C++ shares the C token table. Plain C uses the same table with
        # the C++-only keywords switched off.
        #
        if strMode == 'CPP':
            strMode = 'C'
            self.strSuppressTokens = []
        elif strMode == 'C':
            self.strSuppressTokens = ['CPPKEYWORD']
        else:
            self.strSuppressTokens = []
        self.strMode = strMode

    def _WriteToken(self, strValue, strStyle):
        """
        Switch to strStyle and write the token text.

        A 'NewPara' token is the line break itself: ChangeStyle() has
        already opened the new paragraph, so its text is not written.
        """
        self.ChangeStyle(strStyle)
        if strStyle != 'NewPara':
            self.WriteContent(strValue)

    def PythonHighlightToken(self, strTok, oMatch, strStyle):
        """
        Callback for python specific highlighting.

        Returns the name of the mode to switch to, or None to stay in the
        current mode.
        """
        self._WriteToken(oMatch.group(), strStyle)
        if strTok == 'MULTILINESTRING':
            #
            # Remember you are in a string and remember how it was
            # started (""" vs ''')
            #
            self.strMultilineString = oMatch.group(1)
            return 'PythonMultilineString'
        if strTok == 'ENDMULTILINESTRING' and oMatch.group(1) == self.strMultilineString:
            #
            # The delimiter found is the one that opened the string, so
            # the string ends here. Any other delimiter is just content.
            #
            self.strMultilineString = ''
            return 'PYTHON'
        return None

    def CHighlightToken(self, strTok, oMatch, strStyle):
        """
        Callback for C specific highlighting.

        C needs no mode switches, so this always returns None.
        """
        self._WriteToken(oMatch.group(), strStyle)
        return None

    def PHPHighlightToken(self, strTok, oMatch, strStyle):
        """
        Callback for PHP specific highlighting.

        Returns the name of the mode to switch to, or None to stay in the
        current mode.
        """
        self._WriteToken(oMatch.group(), strStyle)
        if strTok == 'MULTILINESTRING':
            #
            # Start of a heredoc: remember its label, the string ends at
            # a line starting with the same label.
            #
            self.strMultilineString = oMatch.group(1)
            return 'PHPMultilineString'
        if strTok == 'ENDMULTILINESTRING' and oMatch.group(1) == self.strMultilineString:
            #
            # The line starts with the heredoc label, so the string ends.
            #
            self.strMultilineString = ''
            return 'PHP'
        if strTok == 'GOTOHTML':
            #
            # Embedded HTML
            #
            return 'HTML'
        return None

    def HTMLHighlightToken(self, strTok, oMatch, strStyle):
        """
        Callback for HTML specific highlighting.

        Returns the name of the mode to switch to, or None to stay in the
        current mode.
        """
        self._WriteToken(oMatch.group(), strStyle)
        if strTok == 'TAG':
            #
            # Change to mode 1, 'within tag'.
            #
            return 'HTMLTag'
        if strTok == 'ENDTAG':
            #
            # Back to mode 0, 'look for tags'.
            #
            return 'HTML'
        if strTok == 'GOTOPHP':
            #
            # Embedded PHP
            #
            return 'PHP'
        #
        # No state change.
        #
        return None

    #
    # Token tables: mode name -> (callback, ((token, pattern, style), ...)).
    # The patterns of a mode are tried in order at the current input
    # position and the first one that matches wins. The style is a Style
    # element, None (plain text), 'Keep' (stay in the current style) or
    # 'NewPara' (start a new paragraph).
    #
    oStyles = {
        'PYTHON': ( PythonHighlightToken,
            (
                ('PUNC', re.compile( r'[-+*!|&^~/%\=<>\[\]{}(),.:]'), puncstyle),
                ('NUMBER', re.compile( r'0x[0-9a-fA-F]+|[+-]?\d+(\.\d+)?([eE][+-]\d+)?|\d+'), numberstyle),
                ('KEYWORD', re.compile( r'(def|class|break|continue|del|exec|finally|pass|' +
                    r'print|raise|return|try|except|global|assert|lambda|' +
                    r'yield|for|while|if|elif|else|and|in|is|not|or|import|' +
                    r'from|True|False)(?![a-zA-Z0-9_])'), keywordstyle),
                # The lookahead keeps identifiers such as "selfish" intact.
                ('MAGIC', re.compile( r'(self|None)(?![a-zA-Z0-9_])'), variablestyle),
                ('MULTILINESTRING', re.compile( r'r?u?(\'\'\'|""")'), stringstyle),
                ('STRING', re.compile( r'r?u?\'(.*?)(?<!\\)\'|"(.*?)(?<!\\)"'), stringstyle),
                ('IDENTIFIER', re.compile( r'[a-zA-Z_][a-zA-Z0-9_]*'), None),
                ('COMMENT', re.compile( r'\#.*'), commentstyle),
                ('NEWLINE', re.compile( r'\r?\n'), 'NewPara'),
                ('WHITESPACE', re.compile( r'[ ]+'), 'Keep'),
                # if all else fails...
                ('UNKNOWN', re.compile( r'.'), None)
            )),
        'PythonMultilineString': ( PythonHighlightToken,
            (
                ('ENDMULTILINESTRING', re.compile( r'.*?("""|\'\'\')', re.DOTALL), stringstyle),
                ('UNKNOWN', re.compile( r'.'), 'Keep')
            )),
        'C': ( CHighlightToken,
            (
                ('COMMENT', re.compile( r'//.*\r?\n'), commentstyle),
                ('MULTILINECOMMENT', re.compile( r'/\*.*?\*/', re.DOTALL), commentstyle),
                ('PREPROCESSOR', re.compile( r'\s*#.*?[^\\]\s*\n', re.DOTALL), preprocstyle),
                ('PUNC', re.compile( r'[-+*!&|^~/%\=<>\[\]{}(),.:]'), puncstyle),
                ('NUMBER', re.compile( r'0x[0-9a-fA-F]+|[+-]?\d+(\.\d+)?([eE][+-]\d+)?|\d+'),
                    numberstyle),
                ('KEYWORD', re.compile( r'(sizeof|int|long|short|char|void|' +
                    r'signed|unsigned|float|double|' +
                    r'goto|break|return|continue|asm|' +
                    r'case|default|if|else|switch|while|for|do|' +
                    r'struct|union|enum|typedef|' +
                    r'static|register|auto|volatile|extern|const)(?![a-zA-Z0-9_])'), keywordstyle),
                ( 'CPPKEYWORD', re.compile( r'(class|private|protected|public|template|new|delete|' +
                    r'this|friend|using|inline|export|bool|throw|try|catch|' +
                    r'operator|typeid|virtual)(?![a-zA-Z0-9_])'), keywordstyle),
                ('STRING', re.compile( r'r?u?\'(.*?)(?<!\\)\'|"(.*?)(?<!\\)"'), stringstyle),
                ('IDENTIFIER', re.compile( r'[a-zA-Z_][a-zA-Z0-9_]*'), None),
                ('NEWLINE', re.compile( r'\r?\n'), 'NewPara'),
                ('WHITESPACE', re.compile( r'[ ]+'), 'Keep'),
                ('UNKNOWN', re.compile( r'.'), None)
            )),
        'PHP': ( PHPHighlightToken,
            (
                ('COMMENT', re.compile( r'//.*\r?\n'), commentstyle),
                ('MULTILINECOMMENT', re.compile( r'/\*.*?\*/', re.DOTALL), commentstyle),
                ('MULTILINESTRING', re.compile( r'<<<\s*([a-zA-Z0-9_]+)'), stringstyle),
                ('GOTOPHP', re.compile( r'<\?php'), stringstyle),
                ('PUNC', re.compile( r'[-+*!&|^~/%\=<>\[\]{}(),.:]'), puncstyle),
                ('NUMBER', re.compile( r'0x[0-9a-fA-F]+|[+-]?\d+(\.\d+)?([eE][+-]\d+)?|\d+'),
                    numberstyle),
                ('KEYWORD', re.compile( r'(declare|else|enddeclare|endswitch|elseif|endif|if|switch|' +
                    r'as|do|endfor|endforeach|endwhile|for|foreach|while|' +
                    r'case|default|switch|function|return|break|continue|exit|' +
                    r'var|const|boolean|bool|integer|int|real|double|float|string|' +
                    r'array|object|NULL|extends|implements|instanceof|parent|self|' +
                    r'include|require|include_once|require_once|new|true|false)(?![a-zA-Z0-9_])'), keywordstyle),
                ('STRING', re.compile( r'r?u?\'(.*?)(?<!\\)\'|"(.*?)(?<!\\)"'), stringstyle),
                ('VARIABLE', re.compile( r'\$[a-zA-Z_][a-zA-Z0-9_]*'), variablestyle),
                ('IDENTIFIER', re.compile( r'[a-zA-Z_][a-zA-Z0-9_]*'), None),
                ('WHITESPACE', re.compile( r'[ \r\n]+'), 'Keep'),
                ('GOTOHTML', re.compile( r'\?>'), stringstyle),
                ('UNKNOWN', re.compile( r'.'), None)
            )),
        'PHPMultilineString': ( PHPHighlightToken,
            (
                ('ENDMULTILINESTRING', re.compile( r'.*?\n([a-zA-Z0-9_]+)', re.DOTALL), stringstyle),
                ('UNKNOWN', re.compile( r'.*?(?!\n)'), 'Keep')
            )),
        'HTML': ( HTMLHighlightToken,
            # Mode 0: just look for tags
            (
                ('COMMENT', re.compile( r'<!--[^>]*-->|<!>'), commentstyle),
                ('XMLCRAP', re.compile( r'<![^>]*>'), preprocstyle),
                ('SCRIPT', re.compile( r'<script .*?</script>', re.IGNORECASE + re.DOTALL), tagstyle),
                ('TAG', re.compile( r'</?\s*[a-zA-Z0-9]+'), tagstyle),
                ('GOTOPHP', re.compile( r'<\?php'), stringstyle),
                ('NEWLINE', re.compile( r'\r?\n'), 'NewPara'),
                ('UNKNOWN', re.compile( r'[^<]*'), None)
            )),
        # Mode 1: within tags,
        'HTMLTag': ( HTMLHighlightToken,
            (
                ('ENDTAG', re.compile( r'>'), tagstyle),
                ('ATTRIBUTE', re.compile( r'[a-zA-Z][a-zA-Z0-9:]*='), attrstyle),
                ('VALUE', re.compile( r'"[^"]*"'), stringstyle),
                ('NEWLINE', re.compile( r'\r?\n'), 'NewPara'),
                ('WHITESPACE', re.compile( r'[ \t\f\v]+'), None),
                ('UNKNOWN', re.compile( r'.'), None)
            ))
        }

    def generatedoc(self, strData):
        """
        Syntax highlight strData in the language given to the constructor.

        Returns the OpenDocumentText document holding the highlighted
        listing. Raises KeyError if the language has no token table.
        """
        i = 0
        strMode = self.strMode
        #
        # While input is not exhausted...
        #
        while i < len(strData):
            #
            # Compare current position with all possible display types.
            #
            for strTok, oRE, strStyle in Highlight.oStyles[strMode][1]:
                if strTok in self.strSuppressTokens:
                    continue
                oMatch = oRE.match(strData, i)
                #
                # Some patterns can match the empty string (e.g. the HTML
                # UNKNOWN pattern in front of a '<' that starts no tag).
                # Accepting such a match would never advance i and loop
                # forever, so it is treated as "no match".
                #
                if oMatch is None or oMatch.end() == i:
                    continue
                strNewMode = Highlight.oStyles[strMode][0](self, strTok, oMatch, strStyle)
                if strNewMode is not None:
                    strMode = strNewMode
                i = oMatch.end()
                break
            else:
                #
                # Token not found so dump out raw text. This doesn't have to be bullet proof.
                #
                self.ChangeStyle(None)
                self.WriteContent(strData[i])
                i += 1
        #
        # Terminate any styles in use.
        #
        self.ChangeStyle(None)
        #
        # NOTE: tabs are not expanded, they are written as ordinary text.
        #
        return self.textdoc

    def _WriteSpaces(self, parent, cnt):
        """
        Write a run of cnt (>= 1) spaces to parent: one literal space
        followed by a <text:s> element that stands for the remaining ones.
        """
        parent.addText(' ')
        if cnt > 1:
            parent.addElement(S(c=cnt-1))

    def WriteSingleline(self, parent, data):
        """
        Append data to the element parent, preserving runs of spaces.

        Every run of spaces is written as one space character plus a
        <text:s> element for the rest of the run; all other text is
        added unchanged. data is not split on newlines here.
        """
        cnt = 0         # number of spaces seen in the current run
        textstart = 0   # index where the text not yet written begins
        for i, ch in enumerate(data):
            if ch == ' ':
                if cnt == 0:
                    # We found the first space. Now print the text before
                    if i > textstart:
                        parent.addText(data[textstart:i])
                    textstart = i
                cnt = cnt+1
            elif cnt > 0:
                # First character after a run of spaces: print the
                # spaces now, this character starts the next text.
                self._WriteSpaces(parent, cnt)
                cnt = 0
                textstart = i
        if cnt > 0:
            # Data ended in spaces. Nothing else is pending, so the run
            # is the only thing left to write (a trailing single space
            # used to be written twice here).
            self._WriteSpaces(parent, cnt)
        elif data:
            parent.addText(data[textstart:])

    def WriteMultiline(self, data):
        """
        Write text that may span several lines.

        Kept for callers of the old interface; WriteContent() handles
        line breaks itself, so this simply delegates to it.
        """
        self.WriteContent(data)

    def WriteContent(self, data):
        """
        Write the content, but convert spaces to <text:s> first

        The first line of data continues the current span (or the current
        paragraph if no span is open). Every '\\n' in data starts a new
        listing paragraph in which the current span style, if any, is
        re-opened, so that tokens spanning several lines keep both their
        line structure and their highlighting.
        """
        lines = data.split('\n')
        self._WriteLine(lines[0])
        for line in lines[1:]:
            self.currPara = P(stylename=self.programliststyle)
            self.textdoc.text.addElement(self.currPara)
            if self.strSpanStyle is None:
                self.currSpan = None
            else:
                self.currSpan = Span(stylename=self.strSpanStyle)
                self.currPara.addElement(self.currSpan)
            self._WriteLine(line)

    def _WriteLine(self, line):
        """
        Write newline-free text to the open span, or to the current
        paragraph if there is no open span.
        """
        if self.currSpan is None:
            self.WriteSingleline(self.currPara, line)
        else:
            self.WriteSingleline(self.currSpan, line)

    def ChangeStyle(self, strStyle):
        """
        Generate output to change from existing style to another style only.

        strStyle is a Style element (open a span with that style), None
        (write plain text into the paragraph), 'Keep' (leave everything
        as it is) or 'NewPara' (start a new paragraph without a span).
        """
        #
        # Output minimal formatting code: only output anything if the style has
        # actually changed.
        #
        if self.strSpanStyle != strStyle:
            if strStyle == 'NewPara':
                self.currPara = P(stylename=self.programliststyle)
                self.textdoc.text.addElement(self.currPara)
                self.currSpan = None
                self.strSpanStyle = None
            elif strStyle != 'Keep':
                if strStyle is None:
                    self.currSpan = None
                else:
                    self.currSpan = Span(stylename=strStyle)
                    self.currPara.addElement(self.currSpan)
                self.strSpanStyle = strStyle
def usage():
    """
    Print the command line synopsis of this script on standard error.
    """
    strProgram = sys.argv[0]
    strSynopsis = "Usage: %s [-l language] [-e encoding] inputfile outputfile\n" % strProgram
    sys.stderr.write(strSynopsis)
#
# Command line driver: parse the options, work out the language, read the
# input file and save the highlighted listing as an OpenDocument file.
#
try:
    opts, args = getopt.getopt(sys.argv[1:], "l:e:", ["language=", "encoding="])
except getopt.GetoptError:
    usage()
    sys.exit(2)

language = None
encoding = 'utf-8'
for o, a in opts:
    if o in ("-l", "--language"):
        language = a.upper()
    elif o in ("-e", "--encoding"):
        encoding = a

if len(args) != 2:
    usage()
    sys.exit(2)

# File name suffix -> language name understood by Highlight.
suffixes = {
    '.py': 'PYTHON',
    '.xhtml': 'HTML',
    '.html': 'HTML',
    '.htm': 'HTML',
    '.c': 'C',
    '.c++': 'CPP',
    '.php': 'PHP'
    }

inputfile = args[0]
outputfile = args[1]

if language is None:
    #
    # No language given: guess it from the file name suffix. The suffix is
    # taken from the lower-cased name so that e.g. "FOO.PY" is recognised.
    #
    lowername = inputfile.lower()
    try:
        language = suffixes[lowername[lowername.rindex('.'):]]
    except (ValueError, KeyError):
        # No dot in the name, or a suffix we do not know.
        usage()
        sys.exit(2)
elif language not in suffixes.values():
    #
    # An unknown -l value would otherwise only fail later with a KeyError
    # inside the highlighter.
    #
    usage()
    sys.exit(2)

#
# io.open decodes while reading and works on Python 2 and 3 alike; the
# with-statement makes sure the file is closed again.
#
with io.open(inputfile, encoding=encoding) as infile:
    data = infile.read()

Highlighted = Highlight(language).generatedoc(data)
Highlighted.save(outputfile)