Taken from upstream PR#5 (https://github.com/asciidoc/asciidoc-py3/pull/5) 6469317 Remove unnecessary decode in a2x (Matthew Peveler) 684913e Fix decoding of file that specifies encoding in header tag in a2x (Matthew Peveler) 8369a97 re-add --nonet option (Matthew Peveler) diff --git c/a2x.py w/a2x.py index 55eb57e..c015079 100755 --- c/a2x.py +++ w/a2x.py @@ -254,15 +254,11 @@ def find_resources(files, tagname, attrname, filter=None): if OPTIONS.dry_run: continue parser = FindResources() - # HTMLParser has problems with non-ASCII strings. - # See http://bugs.python.org/issue3932 - contents = read_file(filename) - mo = re.search(r'\A<\?xml.* encoding="(.*?)"', contents) - if mo: - encoding = mo.group(1) - parser.feed(contents.decode(encoding)) - else: - parser.feed(contents) + with open(filename, 'rb') as open_file: + contents = open_file.read() + mo = re.search(b'\A<\?xml.* encoding="(.*?)"', contents) + contents = contents.decode(mo.group(1).decode('utf-8') if mo else 'utf-8') + parser.feed(contents) parser.close() result = list(set(result)) # Drop duplicate values. result.sort() @@ -337,7 +333,7 @@ def get_source_options(asciidoc_file): result = [] if os.path.isfile(asciidoc_file): options = '' - with open(asciidoc_file) as f: + with open(asciidoc_file, encoding='utf-8') as f: for line in f: mo = re.search(r'^//\s*a2x:', line) if mo: