PyPDF2 编码问题:PyPDF2.utils.PdfReadError: Illegal character in Name Object
参考资料:https://github.com/mstamy2/PyPDF2/issues/438
使用 PyPDF2 做合并 PDF 文件时报错如下:
Traceback (most recent call last):
File "D:\projects\myproject\venv\lib\site-packages\PyPDF2\generic.py", line 484, in readFromStream
return NameObject(name.decode('utf-8'))
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xcb in position 8: invalid continuation byte
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "D:\projects\myproject\apps\backstage\views\busi_contract_manage_view.py", line 703, in post
merge_pdf_result = merge_pdf(final_files, pdf_path)
File "D:\projects\myproject\apps\utils\doc_convert_util.py", line 86, in merge_pdf
pdf_writer.write(new_file)
File "D:\projects\myproject\venv\lib\site-packages\PyPDF2\pdf.py", line 482, in write
self._sweepIndirectReferences(externalReferenceMap, self._root)
File "D:\projects\myproject\venv\lib\site-packages\PyPDF2\pdf.py", line 571, in _sweepIndirectReferences
self._sweepIndirectReferences(externMap, realdata)
File "D:\projects\myproject\venv\lib\site-packages\PyPDF2\pdf.py", line 547, in _sweepIndirectReferences
value = self._sweepIndirectReferences(externMap, value)
File "D:\projects\myproject\venv\lib\site-packages\PyPDF2\pdf.py", line 571, in _sweepIndirectReferences
self._sweepIndirectReferences(externMap, realdata)
File "D:\projects\myproject\venv\lib\site-packages\PyPDF2\pdf.py", line 547, in _sweepIndirectReferences
value = self._sweepIndirectReferences(externMap, value)
File "D:\projects\myproject\venv\lib\site-packages\PyPDF2\pdf.py", line 556, in _sweepIndirectReferences
value = self._sweepIndirectReferences(externMap, data[i])
File "D:\projects\myproject\venv\lib\site-packages\PyPDF2\pdf.py", line 571, in _sweepIndirectReferences
self._sweepIndirectReferences(externMap, realdata)
File "D:\projects\myproject\venv\lib\site-packages\PyPDF2\pdf.py", line 547, in _sweepIndirectReferences
value = self._sweepIndirectReferences(externMap, value)
File "D:\projects\myproject\venv\lib\site-packages\PyPDF2\pdf.py", line 547, in _sweepIndirectReferences
value = self._sweepIndirectReferences(externMap, value)
File "D:\projects\myproject\venv\lib\site-packages\PyPDF2\pdf.py", line 547, in _sweepIndirectReferences
value = self._sweepIndirectReferences(externMap, value)
File "D:\projects\myproject\venv\lib\site-packages\PyPDF2\pdf.py", line 577, in _sweepIndirectReferences
newobj = data.pdf.getObject(data)
File "D:\projects\myproject\venv\lib\site-packages\PyPDF2\pdf.py", line 1611, in getObject
retval = readObject(self.stream, self)
File "D:\projects\myproject\venv\lib\site-packages\PyPDF2\generic.py", line 66, in readObject
return DictionaryObject.readFromStream(stream, pdf)
File "D:\projects\myproject\venv\lib\site-packages\PyPDF2\generic.py", line 579, in readFromStream
value = readObject(stream, pdf)
File "D:\projects\myproject\venv\lib\site-packages\PyPDF2\generic.py", line 60, in readObject
return NameObject.readFromStream(stream, pdf)
File "D:\projects\myproject\venv\lib\site-packages\PyPDF2\generic.py", line 492, in readFromStream
raise utils.PdfReadError("Illegal character in Name Object")
PyPDF2.utils.PdfReadError: Illegal character in Name Object
根据上面 traceback 中最先抛出的 UnicodeDecodeError,找到对应的报错文件:
File "D:\projects\myproject\venv\lib\site-packages\PyPDF2\generic.py", line 484
第484行 原代码:
try:
return NameObject(name.decode('utf-8'))
except (UnicodeEncodeError, UnicodeDecodeError) as e:
# Name objects should represent irregular characters
# with a '#' followed by the symbol's hex number
if not pdf.strict:
warnings.warn("Illegal character in Name Object", utils.PdfReadWarning)
return NameObject(name)
else:
raise utils.PdfReadError("Illegal character in Name Object")
报错的原因是 Name 对象中含有无法按 UTF-8 解码的字节(如上面的 0xcb),推测实际为 GBK 编码。因此在 except 分支中先尝试用 GBK 解码,失败后再走原来的处理逻辑。加入的代码:
return NameObject(name.decode('gbk'))
修改后
try:
return NameObject(name.decode('utf-8'))
except (UnicodeEncodeError, UnicodeDecodeError) as e:
try:
return NameObject(name.decode('gbk'))
except (UnicodeEncodeError, UnicodeDecodeError) as e:
# Name objects should represent irregular characters
# with a '#' followed by the symbol's hex number
if not pdf.strict:
warnings.warn("Illegal character in Name Object", utils.PdfReadWarning)
return NameObject(name)
else:
raise utils.PdfReadError("Illegal character in Name Object")
修改后仍会报错,还需要修改另一处:
Lib/site-packages/PyPDF2/utils.py 第238行
原代码
r = s.encode('latin-1')
if len(s) < 2:
bc[s] = r
return r
修改后代码:
try:
r = s.encode('latin-1')
except Exception as e:
r = s.encode('utf-8')
if len(s) < 2:
bc[s] = r
return r