main/asciidoc/asciidoc-python3-a2x-decode-fix.patch


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41

Taken from upstream PR#5 (https://github.com/asciidoc/asciidoc-py3/pull/5)

6469317 Remove unnecessary decode in a2x (Matthew Peveler)
684913e Fix decoding of file that specifies encoding in header tag in a2x (Matthew Peveler)
8369a97 re-add --nonet option (Matthew Peveler)

diff --git c/a2x.py w/a2x.py
index 55eb57e..c015079 100755
--- c/a2x.py
+++ w/a2x.py
@@ -254,15 +254,11 @@ def find_resources(files, tagname, attrname, filter=None):
         if OPTIONS.dry_run:
             continue
         parser = FindResources()
-        # HTMLParser has problems with non-ASCII strings.
-        # See http://bugs.python.org/issue3932
-        contents = read_file(filename)
-        mo = re.search(r'\A<\?xml.* encoding="(.*?)"', contents)
-        if mo:
-            encoding = mo.group(1)
-            parser.feed(contents.decode(encoding))
-        else:
-            parser.feed(contents)
+        with open(filename, 'rb') as open_file:
+            contents = open_file.read()
+        mo = re.search(b'\A<\?xml.* encoding="(.*?)"', contents)
+        contents = contents.decode(mo.group(1).decode('utf-8') if mo else 'utf-8')
+        parser.feed(contents)
         parser.close()
     result = list(set(result))   # Drop duplicate values.
     result.sort()
@@ -337,7 +333,7 @@ def get_source_options(asciidoc_file):
     result = []
     if os.path.isfile(asciidoc_file):
         options = ''
-        with open(asciidoc_file) as f:
+        with open(asciidoc_file, encoding='utf-8') as f:
             for line in f:
                 mo = re.search(r'^//\s*a2x:', line)
                 if mo: