1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
|
From 0bafbd9c1273fab0dc79fd20db0ffc4443683f96 Mon Sep 17 00:00:00 2001
From: Ken Sharp <ken.sharp@artifex.com>
Date: Mon, 29 Apr 2019 11:14:06 +0100
Subject: [PATCH] PDF interpreter - Decode ToUnicode entries of the form
/Identity-H/V
Bug #701003 "Text searchability broken due to omission of /ToUnicode /Identity-H"
The PDF references from 1.2 to 2.0 all state that the value associated
with a ToUnicode key in a FontDescriptor must be a stream object. However
this file (and one case seen previously, bug 687351) have FontDescriptor
dictionaries where the value associated with a /ToUnicode key is a
name object, in both cases /Identity-H.
Although this is clearly not legal, Acrobat not only tolerates it, it
actually uses it for search/copy/paste (see bug 701003 for details).
Without the key Acrobat is unable to successfully search the output file.
We can't simply preserve the name object as a ToUnicode value; when
handling ToUnicode we actually decode the CMap and build a
GlyphNames2Unicode map (an internal representation of the G2U data
produced by the Microsoft PostScript printer driver). When writing the
output file we use that information to get a Unicode value for each
character we write, and build a new ToUnicode CMap using that.
This commit tackles the problem by pre-scanning for a name object and
then checking to see if it's Identity-H or Identity-V (although we have
not seen an Identity-V, there seems no reason why it wouldn't be
equally valid). If we find either of these then we construct a
GlyphNames2Unicode table for all possible values (0 - 65535) and store
that with the font as normal. When we write the output file we only
write the required entries for the subset font, so we write a now
completely legal ToUnicode CMap, and Acrobat is equally happy with that
as the original name.
If the ToUnicode value isn't a name object, or isn't one of the
identities then we proceed as before. This means we will print a
warning for non-conforming ToUnicode entries and ignore them.
---
Resource/Init/pdf_font.ps | 200 ++++++++++++++++++++++++--------------
1 file changed, 129 insertions(+), 71 deletions(-)
diff --git a/Resource/Init/pdf_font.ps b/Resource/Init/pdf_font.ps
index 0e802d393..964d54c1e 100644
--- a/Resource/Init/pdf_font.ps
+++ b/Resource/Init/pdf_font.ps
@@ -621,86 +621,144 @@ currentdict end readonly def
PDFDEBUG {
(.processToUnicode beg) =
} if
- 2 index /ToUnicode knownoget {
- dup type /dicttype eq { dup /File known not } { //true } ifelse {
- % We undefine wrong /Length and define /File in stream dictionaries.
- % Bug687351.pdf defines /ToUnicode /Identity-H, what is incorrect.
- ( **** Warning: Ignoring bad ToUnicode CMap.\n) pdfformatwarning
- pop
+
+ 2 index /ToUnicode knownoget
+ {
+ dup type /nametype eq {
+ % This is contrary to the specification but it seems that Acrobat at least will accept
+ % a ToUnicode with a value of Identity-H *and* will use that for search, copy/paste.
+ % We can't pass through a name, so the best we can do is build a GlyphNames2Unicode
+ % map matching that which would have been generated by a full 16-bit Identity CMap
+ %
+ % See bug numbers 701003 and 687351
+ %
+ dup /Identity-H eq 1 index /Identity-V eq or{
+ pop
+ 1 index /FontInfo .knownget not {
+ currentglobal 2 index dup gcheck setglobal
+ /FontInfo 5 dict dup 5 1 roll .forceput
+ setglobal
+ } if
+ dup /GlyphNames2Unicode .knownget not {
+ //true % No existing G2U, make one
+ } {
+ dup wcheck {
+ //false % Existing, writeable G2U, don't make new one
+ } {
+ pop //true % Existing read only G2U, make new one
+ } ifelse
+ } ifelse
+ {
+ currentglobal exch dup gcheck setglobal
+ dup /GlyphNames2Unicode 100 dict dup 4 1 roll .forceput
+ 3 2 roll setglobal
+ } if % font-res font-dict encoding|null font-info g2u
+
+ 0 1 65535{
+ % g2u index
+ dup dup 256 mod exch 256 idiv % g2u index lo-byte hi-byte
+ 2 string dup 0 4 -1 roll % g2u index lo-byte () () 0 hi-byte
+ put % g2u index lo-byte (x)
+ dup 1 % g2u index lo-byte (x) (x) 1
+ 4 -1 roll put % g2u index (x) (x) 1 lo-byte -> dict index (xx)
+ 2 index % g2u index (xx) dict
+ 3 1 roll % g2u g2u index (xx)
+ put % g2u
+ } for
+ pop % font-res font-dict encoding|null font-info
+ pop % font-res font-dict encoding|null
+ //false % We built a GlyphNames2Unicode table, don't need to process further
+ }{
+ //true % name is not Identity-V or H, fail by falling through
+ }ifelse
} {
- /PDFScanRules .getuserparam dup //null eq {
- pop //PDFScanRules_null
- } {
- 1 dict dup /PDFScanRules 4 -1 roll put
- } ifelse
- //PDFScanRules_true setuserparams
- PDFfile fileposition
- 3 -1 roll
- count 1 sub
- countdictstack
- { //false resolvestream
- % Following Acrobat we ignore everything outside
- % begincodespacerange .. endcmap.
- dup 0 (begincodespacerange) /SubFileDecode filter flushfile
- /CIDInit /ProcSet findresource begin
- //ToUnicodeCMapReader begin
- 12 dict begin
- /CMapType 2 def
- mark exch % emulate 'begincodespacerange'
- 0 (endcmap) /SubFileDecode filter cvx /begincmap cvx exch 2 .execn
- endcmap
- userdict /.lastToUnicode currentdict put
- end end end
- }
+ //true
+ } ifelse % not a name, try as a dictionary (as specified)
- PDFSTOPONERROR {
- { exec } 0 get
- //false
- 5 -2 roll
- 5
+ % If the ToUnicode isn't a name, or the name isn't Identity-V or -H then follow the specification
+ % If its not a dictionary type throw an error, otherwise decode it and build a GlyphNames2Unicode
+ %
+ {
+ dup type /dicttype eq { dup /File known not } { //true } ifelse {
+ % We undefine wrong /Length and define /File in stream dictionaries.
+ % Bug687351.pdf defines /ToUnicode /Identity-H, what is incorrect.
+ ( **** Warning: Ignoring bad ToUnicode CMap.\n) pdfformatwarning
+ pop
} {
- { stopped } 0 get
- 4 2 roll
- 4
- } ifelse
- array astore cvx exec
+ /PDFScanRules .getuserparam dup //null eq {
+ pop //PDFScanRules_null
+ } {
+ 1 dict dup /PDFScanRules 4 -1 roll put
+ } ifelse
+ //PDFScanRules_true setuserparams
+ PDFfile fileposition
+ 3 -1 roll
+ count 1 sub
+ countdictstack
+ { //false resolvestream
+ % Following Acrobat we ignore everything outside
+ % begincodespacerange .. endcmap.
+ dup 0 (begincodespacerange) /SubFileDecode filter flushfile
+ /CIDInit /ProcSet findresource begin
+ //ToUnicodeCMapReader begin
+ 12 dict begin
+ /CMapType 2 def
+ mark exch % emulate 'begincodespacerange'
+ 0 (endcmap) /SubFileDecode filter cvx /begincmap cvx exch 2 .execn
+ endcmap
+ userdict /.lastToUnicode currentdict put
+ end end end
+ }
- countdictstack exch sub 0 .max { end } repeat
- count exch sub 2 sub 0 .max { exch pop } repeat
- 3 1 roll % Stach the stop flag.
- PDFfile exch setfileposition
- setuserparams
- {
- ( **** Warning: Failed to read ToUnicode CMap.\n) pdfformatwarning
- } {
- 1 index /FontInfo .knownget not {
- currentglobal 2 index dup gcheck setglobal
- /FontInfo 5 dict dup 5 1 roll .forceput
- setglobal
- } if
- dup /GlyphNames2Unicode .knownget not {
- //true % No existing G2U, make one
+ PDFSTOPONERROR {
+ { exec } 0 get
+ //false
+ 5 -2 roll
+ 5
+ } {
+ { stopped } 0 get
+ 4 2 roll
+ 4
+ } ifelse
+ array astore cvx exec
+
+ countdictstack exch sub 0 .max { end } repeat
+ count exch sub 2 sub 0 .max { exch pop } repeat
+ 3 1 roll % Stach the stop flag.
+ PDFfile exch setfileposition
+ setuserparams
+ {
+ ( **** Warning: Failed to read ToUnicode CMap.\n) pdfformatwarning
} {
- dup wcheck {
- //false % Existing, writeable G2U, don't make new one
+ 1 index /FontInfo .knownget not {
+ currentglobal 2 index dup gcheck setglobal
+ /FontInfo 5 dict dup 5 1 roll .forceput
+ setglobal
+ } if
+ dup /GlyphNames2Unicode .knownget not {
+ //true % No existing G2U, make one
} {
- pop //true % Existing read only G2U, make new one
+ dup wcheck {
+ //false % Existing, writeable G2U, don't make new one
+ } {
+ pop //true % Existing read only G2U, make new one
+ } ifelse
} ifelse
+ {
+ currentglobal exch dup gcheck setglobal
+ dup /GlyphNames2Unicode 100 dict dup 4 1 roll .forceput
+ 3 2 roll setglobal
+ } if % font-res font-dict encoding|null font-info g2u
+ exch pop exch % font-res font-dict g2u encoding|null
+ userdict /.lastToUnicode get % font-res font-dict g2u Encoding|null CMap
+ .convert_ToUnicode-into-g2u % font-res font-dict
+ //null % font-res font-dict //null
} ifelse
- {
- currentglobal exch dup gcheck setglobal
- dup /GlyphNames2Unicode 100 dict dup 4 1 roll .forceput
- 3 2 roll setglobal
- } if % font-res font-dict encoding|null font-info g2u
- exch pop exch % font-res font-dict g2u encoding|null
- userdict /.lastToUnicode get % font-res font-dict g2u Encoding|null CMap
- .convert_ToUnicode-into-g2u % font-res font-dict
- //null % font-res font-dict //null
} ifelse
- } ifelse
- } if
- PDFDEBUG {
- (.processToUnicode end) =
+ } if
+ PDFDEBUG {
+ (.processToUnicode end) =
+ } if
} if
} if
} stopped
--
2.23.0
|