From d7f5eafddb46106850c036d118d33f0416ef3441 Mon Sep 17 00:00:00 2001 From: oscardssmith Date: Mon, 22 Aug 2016 15:23:34 -0400 Subject: [PATCH 1/2] speed up escape sequences Changes readStringFromStream to use a dict of escapes rather than a long if/else chain. (should lead to speed up, and looks cleaner) --- PyPDF2/generic.py | 99 ++++++++++++++++++++++++----------------------- 1 file changed, 50 insertions(+), 49 deletions(-) diff --git a/PyPDF2/generic.py b/PyPDF2/generic.py index 10e1a35..2f93d3c 100644 --- a/PyPDF2/generic.py +++ b/PyPDF2/generic.py @@ -340,55 +340,56 @@ def readStringFromStream(stream): break elif tok == b_("\\"): tok = stream.read(1) - if tok == b_("n"): - tok = b_("\n") - elif tok == b_("r"): - tok = b_("\r") - elif tok == b_("t"): - tok = b_("\t") - elif tok == b_("b"): - tok = b_("\b") - elif tok == b_("f"): - tok = b_("\f") - elif tok == b_("c"): - tok = b_("\c") - elif tok == b_("("): - tok = b_("(") - elif tok == b_(")"): - tok = b_(")") - elif tok == b_("/"): - tok = b_("/") - elif tok == b_("\\"): - tok = b_("\\") - elif tok in (b_(" "), b_("/"), b_("%"), b_("<"), b_(">"), b_("["), - b_("]"), b_("#"), b_("_"), b_("&"), b_('$')): - # odd/unnessecary escape sequences we have encountered - tok = b_(tok) - elif tok.isdigit(): - # "The number ddd may consist of one, two, or three - # octal digits; high-order overflow shall be ignored. - # Three octal digits shall be used, with leading zeros - # as needed, if the next character of the string is also - # a digit." (PDF reference 7.3.4.2, p 16) - for i in range(2): - ntok = stream.read(1) - if ntok.isdigit(): - tok += ntok - else: - break - tok = b_(chr(int(tok, base=8))) - elif tok in b_("\n\r"): - # This case is hit when a backslash followed by a line - # break occurs. If it's a multi-char EOL, consume the - # second character: - tok = stream.read(1) - if not tok in b_("\n\r"): - stream.seek(-1, 1) - # Then don't add anything to the actual string, since this - # line break was escaped: - tok = b_('') - else: - raise utils.PdfReadError(r"Unexpected escaped string: %s" % tok) + ESCAPE_DICT = {b_("n") : b_("\n"), + b_("r") : b_("\r"), + b_("t") : b_("\t"), + b_("b") : b_("\b"), + b_("f") : b_("\f"), + b_("c") : b_("\c"), + b_("(") : b_("("), + b_(")") : b_(")"), + b_("/") : b_("/"), + b_("\\") : b_("\\"), + b_(" ") : b_(" "), + b_("/") : b_("/"), + b_("%") : b_("%"), + b_("<") : b_("<"), + b_(">") : b_(">"), + b_("[") : b_("["), + b_("]") : b_("]"), + b_("#") : b_("#"), + b_("_") : b_("_"), + b_("&") : b_("&"), + b_('$') : b_('$'), + } + try: + tok = escape_dict[tok] + except KeyError: + if tok.isdigit(): + # "The number ddd may consist of one, two, or three + # octal digits; high-order overflow shall be ignored. + # Three octal digits shall be used, with leading zeros + # as needed, if the next character of the string is also + # a digit." (PDF reference 7.3.4.2, p 16) + for i in range(2): + ntok = stream.read(1) + if ntok.isdigit(): + tok += ntok + else: + break + tok = b_(chr(int(tok, base=8))) + elif tok in b_("\n\r"): + # This case is hit when a backslash followed by a line + # break occurs. If it's a multi-char EOL, consume the + # second character: + tok = stream.read(1) + if not tok in b_("\n\r"): + stream.seek(-1, 1) + # Then don't add anything to the actual string, since this + # line break was escaped: + tok = b_('') + else: + raise utils.PdfReadError(r"Unexpected escaped string: %s" % tok) txt += tok return createStringObject(txt) From 77629e6266709b65cce24d6346b8be0bfa29cf90 Mon Sep 17 00:00:00 2001 From: Matthew Stamy Date: Tue, 1 Nov 2016 12:23:59 -0500 Subject: [PATCH 2/2] Correct name error --- PyPDF2/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyPDF2/generic.py b/PyPDF2/generic.py index 2f93d3c..959957d 100644 --- a/PyPDF2/generic.py +++ b/PyPDF2/generic.py @@ -363,7 +363,7 @@ def readStringFromStream(stream): b_('$') : b_('$'), } try: - tok = escape_dict[tok] + tok = ESCAPE_DICT[tok] except KeyError: if tok.isdigit(): # "The number ddd may consist of one, two, or three