View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.imaging.formats.jpeg.iptc;
19  
20  import static org.apache.commons.imaging.common.BinaryFunctions.read2Bytes;
21  import static org.apache.commons.imaging.common.BinaryFunctions.read4Bytes;
22  import static org.apache.commons.imaging.common.BinaryFunctions.readByte;
23  import static org.apache.commons.imaging.common.BinaryFunctions.readBytes;
24  import static org.apache.commons.imaging.common.BinaryFunctions.slice;
25  import static org.apache.commons.imaging.common.BinaryFunctions.startsWith;
26  
27  import java.io.ByteArrayInputStream;
28  import java.io.ByteArrayOutputStream;
29  import java.io.IOException;
30  import java.io.InputStream;
31  import java.nio.ByteOrder;
32  import java.nio.charset.Charset;
33  import java.nio.charset.StandardCharsets;
34  import java.util.ArrayList;
35  import java.util.Arrays;
36  import java.util.Comparator;
37  import java.util.List;
38  import java.util.Objects;
39  import java.util.logging.Level;
40  import java.util.logging.Logger;
41  
42  import org.apache.commons.imaging.ImagingConstants;
43  import org.apache.commons.imaging.ImagingException;
44  import org.apache.commons.imaging.ImagingParameters;
45  import org.apache.commons.imaging.common.Allocator;
46  import org.apache.commons.imaging.common.BinaryFileParser;
47  import org.apache.commons.imaging.common.BinaryFunctions;
48  import org.apache.commons.imaging.common.BinaryOutputStream;
49  import org.apache.commons.imaging.common.ByteConversions;
50  import org.apache.commons.imaging.formats.jpeg.JpegConstants;
51  import org.apache.commons.imaging.formats.jpeg.JpegImagingParameters;
52  import org.apache.commons.imaging.internal.Debug;
53  
54  public class IptcParser extends BinaryFileParser {
55  
56      private static final Logger LOGGER = Logger.getLogger(IptcParser.class.getName());
57  
58      private static final ByteOrder APP13_BYTE_ORDER = ByteOrder.BIG_ENDIAN;
59  
60      /**
61       * Block types (or Image Resource IDs) that are not recommended to be interpreted when libraries process Photoshop IPTC metadata.
62       *
63       * @see <a href="https://www.adobe.com/devnet-apps/photoshop/fileformatashtml/">Adobe Photoshop File Formats Specification</a>
64       * @see <a href="https://issues.apache.org/jira/browse/IMAGING-246">IMAGING-246</a>
65       * @since 1.0-alpha2
66       */
67      private static final List<Integer> PHOTOSHOP_IGNORED_BLOCK_TYPE = Arrays.asList(1084, 1085, 1086, 1087);
68  
69      private static final Charset DEFAULT_CHARSET = StandardCharsets.ISO_8859_1;
70      private static final int ENV_TAG_CODED_CHARACTER_SET = 90;
71      private static final byte[] CHARACTER_ESCAPE_SEQUENCE = { '\u001B', '%', 'G' };
72  
73      public IptcParser() {
74          super(ByteOrder.BIG_ENDIAN);
75      }
76  
77      private Charset findCharset(final byte[] codedCharset) {
78          final String codedCharsetString = new String(codedCharset, StandardCharsets.ISO_8859_1);
79          try {
80              if (Charset.isSupported(codedCharsetString)) {
81                  return Charset.forName(codedCharsetString);
82              }
83          } catch (final IllegalArgumentException ignored) {
84              // ignored
85          }
86          // check if encoding is a escape sequence
87          // normalize encoding byte sequence
88          final byte[] codedCharsetNormalized = Allocator.byteArray(codedCharset.length);
89          int j = 0;
90          for (final byte element : codedCharset) {
91              if (element != ' ') {
92                  codedCharsetNormalized[j++] = element;
93              }
94          }
95  
96          if (Objects.deepEquals(codedCharsetNormalized, CHARACTER_ESCAPE_SEQUENCE)) {
97              return StandardCharsets.UTF_8;
98          }
99          return DEFAULT_CHARSET;
100     }
101 
102     public boolean isPhotoshopJpegSegment(final byte[] segmentData) {
103         if (!startsWith(segmentData, JpegConstants.PHOTOSHOP_IDENTIFICATION_STRING)) {
104             return false;
105         }
106 
107         final int index = JpegConstants.PHOTOSHOP_IDENTIFICATION_STRING.size();
108         return index + 4 <= segmentData.length && ByteConversions.toInt(segmentData, index, APP13_BYTE_ORDER) == JpegConstants.CONST_8BIM;
109     }
110 
111     protected List<IptcBlock> parseAllBlocks(final byte[] bytes, final boolean strict) throws ImagingException, IOException {
112         final List<IptcBlock> blocks = new ArrayList<>();
113 
114         try (InputStream bis = new ByteArrayInputStream(bytes)) {
115 
116             // Note that these are unsigned quantities. Name is always an even
117             // number of bytes (including the 1st byte, which is the size.)
118 
119             final byte[] idString = readBytes("", bis, JpegConstants.PHOTOSHOP_IDENTIFICATION_STRING.size(), "App13 Segment missing identification string");
120             if (!JpegConstants.PHOTOSHOP_IDENTIFICATION_STRING.equals(idString)) {
121                 throw new ImagingException("Not a Photoshop App13 Segment");
122             }
123 
124             // int index = PHOTOSHOP_IDENTIFICATION_STRING.length;
125 
126             while (true) {
127                 final int imageResourceBlockSignature;
128                 try {
129                     imageResourceBlockSignature = read4Bytes("", bis, "Image Resource Block missing identification string", APP13_BYTE_ORDER);
130                 } catch (final IOException ioEx) {
131                     break;
132                 }
133                 if (imageResourceBlockSignature != JpegConstants.CONST_8BIM) {
134                     throw new ImagingException("Invalid Image Resource Block Signature");
135                 }
136 
137                 final int blockType = read2Bytes("", bis, "Image Resource Block missing type", APP13_BYTE_ORDER);
138                 Debug.debug("blockType: " + blockType + " (0x" + Integer.toHexString(blockType) + ")");
139 
140                 // skip blocks that the photoshop spec recommends to, see IMAGING-246
141                 if (PHOTOSHOP_IGNORED_BLOCK_TYPE.contains(blockType)) {
142                     Debug.debug("Skipping blockType: " + blockType + " (0x" + Integer.toHexString(blockType) + ")");
143                     // if there is still data in this block, before the next image resource block
144                     // (8BIM), then we must consume these bytes to leave a pointer ready to read
145                     // the next block
146                     BinaryFunctions.searchQuad(JpegConstants.CONST_8BIM, bis);
147                     continue;
148                 }
149 
150                 final int blockNameLength = readByte("Name length", bis, "Image Resource Block missing name length");
151                 if (blockNameLength > 0) {
152                     Debug.debug("blockNameLength: " + blockNameLength + " (0x" + Integer.toHexString(blockNameLength) + ")");
153                 }
154                 byte[] blockNameBytes;
155                 if (blockNameLength == 0) {
156                     readByte("Block name bytes", bis, "Image Resource Block has invalid name");
157                     blockNameBytes = ImagingConstants.EMPTY_BYTE_ARRAY;
158                 } else {
159                     try {
160                         blockNameBytes = readBytes("", bis, blockNameLength, "Invalid Image Resource Block name");
161                     } catch (final IOException ioEx) {
162                         if (strict) {
163                             throw ioEx;
164                         }
165                         break;
166                     }
167 
168                     if (blockNameLength % 2 == 0) {
169                         readByte("Padding byte", bis, "Image Resource Block missing padding byte");
170                     }
171                 }
172 
173                 final int blockSize = read4Bytes("", bis, "Image Resource Block missing size", APP13_BYTE_ORDER);
174                 Debug.debug("blockSize: " + blockSize + " (0x" + Integer.toHexString(blockSize) + ")");
175 
176                 /*
177                  * doesn't catch cases where blocksize is invalid but is still less than bytes.length but will at least prevent OutOfMemory errors
178                  */
179                 if (blockSize > bytes.length) {
180                     throw new ImagingException("Invalid Block Size : " + blockSize + " > " + bytes.length);
181                 }
182 
183                 final byte[] blockData;
184                 try {
185                     blockData = readBytes("", bis, blockSize, "Invalid Image Resource Block data");
186                 } catch (final IOException ioEx) {
187                     if (strict) {
188                         throw ioEx;
189                     }
190                     break;
191                 }
192 
193                 blocks.add(new IptcBlock(blockType, blockNameBytes, blockData));
194 
195                 if (blockSize % 2 != 0) {
196                     readByte("Padding byte", bis, "Image Resource Block missing padding byte");
197                 }
198             }
199 
200             return blocks;
201         }
202     }
203 
204     protected List<IptcRecord> parseIptcBlock(final byte[] bytes) {
205         Charset charset = DEFAULT_CHARSET;
206         final List<IptcRecord> elements = new ArrayList<>();
207 
208         int index = 0;
209         // Integer recordVersion = null;
210         while (index + 1 < bytes.length) {
211             final int tagMarker = 0xff & bytes[index++];
212             Debug.debug("tagMarker: " + tagMarker + " (0x" + Integer.toHexString(tagMarker) + ")");
213 
214             if (tagMarker != IptcConstants.IPTC_RECORD_TAG_MARKER) {
215                 if (LOGGER.isLoggable(Level.FINE)) {
216                     LOGGER.fine("Unexpected record tag marker in IPTC data.");
217                 }
218                 return elements;
219             }
220 
221             final int recordNumber = 0xff & bytes[index++];
222             Debug.debug("recordNumber: " + recordNumber + " (0x" + Integer.toHexString(recordNumber) + ")");
223 
224             // int recordPrefix = convertByteArrayToShort("recordPrefix", index,
225             // bytes);
226             // if (verbose)
227             // Debug.debug("recordPrefix", recordPrefix + " (0x"
228             // + Integer.toHexString(recordPrefix) + ")");
229             // index += 2;
230             //
231             // if (recordPrefix != IPTC_RECORD_PREFIX)
232             // {
233             // if (verbose)
234             // System.out
235             // .println("Unexpected record prefix in IPTC data!");
236             // return elements;
237             // }
238 
239             // throw new ImageReadException(
240             // "Unexpected record prefix in IPTC data.");
241 
242             final int recordType = 0xff & bytes[index];
243             Debug.debug("recordType: " + recordType + " (0x" + Integer.toHexString(recordType) + ")");
244             index++;
245 
246             final int recordSize = ByteConversions.toUInt16(bytes, index, getByteOrder());
247             index += 2;
248 
249             final boolean extendedDataset = recordSize > IptcConstants.IPTC_NON_EXTENDED_RECORD_MAXIMUM_SIZE;
250             final int dataFieldCountLength = recordSize & 0x7fff;
251             if (extendedDataset) {
252                 Debug.debug("extendedDataset. dataFieldCountLength: " + dataFieldCountLength);
253             }
254             if (extendedDataset) {
255                 // ignore extended dataset and everything after.
256                 return elements;
257             }
258 
259             final byte[] recordData = slice(bytes, index, recordSize);
260             index += recordSize;
261 
262             // Debug.debug("recordSize", recordSize + " (0x"
263             // + Integer.toHexString(recordSize) + ")");
264 
265             if (recordNumber == IptcConstants.IPTC_ENVELOPE_RECORD_NUMBER && recordType == ENV_TAG_CODED_CHARACTER_SET) {
266                 charset = findCharset(recordData);
267                 continue;
268             }
269 
270             if (recordNumber != IptcConstants.IPTC_APPLICATION_2_RECORD_NUMBER) {
271                 continue;
272             }
273 
274             if (recordType == 0) {
275                 if (LOGGER.isLoggable(Level.FINE)) {
276                     LOGGER.fine("ignore record version record! " + elements.size());
277                 }
278                 // ignore "record version" record;
279                 continue;
280             }
281             // if (recordVersion == null)
282             // {
283             // // The first record in a JPEG/Photoshop IPTC block must be
284             // // the record version.
285             // if (recordType != 0)
286             // throw new ImageReadException("Missing record version: "
287             // + recordType);
288             // recordVersion = new Integer(convertByteArrayToShort(
289             // "recordNumber", recordData));
290             //
291             // if (recordSize != 2)
292             // throw new ImageReadException(
293             // "Invalid record version record size: " + recordSize);
294             //
295             // // JPEG/Photoshop IPTC metadata is always in Record version
296             // // 2
297             // if (recordVersion.intValue() != 2)
298             // throw new ImageReadException(
299             // "Invalid IPTC record version: " + recordVersion);
300             //
301             // // Debug.debug("recordVersion", recordVersion);
302             // continue;
303             // }
304 
305             final String value = new String(recordData, charset);
306 
307             final IptcType iptcType = IptcTypeLookup.getIptcType(recordType);
308 
309             // Debug.debug("iptcType", iptcType);
310             // debugByteArray("iptcData", iptcData);
311             // Debug.debug();
312 
313             // if (recordType == IPTC_TYPE_CREDIT.type
314             // || recordType == IPTC_TYPE_OBJECT_NAME.type)
315             // {
316             // this.debugByteArray("recordData", recordData);
317             // Debug.debug("index", IPTC_TYPE_CREDIT.name);
318             // }
319 
320             final IptcRecord element = new IptcRecord(iptcType, value);
321             elements.add(element);
322         }
323 
324         return elements;
325     }
326 
327     public PhotoshopApp13Data parsePhotoshopSegment(final byte[] bytes, final boolean strict) throws ImagingException, IOException {
328         final List<IptcRecord> records = new ArrayList<>();
329 
330         final List<IptcBlock> blocks = parseAllBlocks(bytes, strict);
331 
332         for (final IptcBlock block : blocks) {
333             // Ignore everything but IPTC data.
334             if (!block.isIptcBlock()) {
335                 continue;
336             }
337 
338             records.addAll(parseIptcBlock(block.getBlockData()));
339         }
340 
341         return new PhotoshopApp13Data(records, blocks);
342     }
343 
344     // private void writeIPTCRecord(BinaryOutputStream bos, )
345 
    /*
     * In practice, App13 segments are only used for Photoshop/IPTC metadata. However, we should not treat App13 segments without Photoshop's signature as
     * Photoshop/IPTC segments.
     *
     * A Photoshop/IPTC App13 segment begins with the Photoshop Identification string.
     *
     * There follow 0-N blocks (Photoshop calls them "Image Resource Blocks").
     *
     * Each block has the following structure:
     *
     * 1. 4-byte type. This is always "8BIM" for blocks in a Photoshop App13 segment.
     * 2. 2-byte id. IPTC data is stored in blocks with id 0x0404, aka. IPTC_NAA_RECORD_IMAGE_RESOURCE_ID.
     * 3. Block name as a Pascal String. This is padded to have an even length.
     * 4. 4-byte size (in bytes).
     * 5. Block data. This is also padded to have an even length.
     *
     * The block data consists of 0-N records. A record has the following structure:
     *
     * 1. 2-byte prefix. The value is always 0x1C02.
     * 2. 1-byte record type. The record types are documented by the IPTC. See IptcConstants.
     * 3. 2-byte record size (in bytes).
     * 4. Record data, "record size" bytes long.
     *
     * Record data (unlike block data) is NOT padded to have an even length.
     *
     * Record data, for IPTC record, should always be ISO-8859-1. But according to SANSELAN-33, this isn't always the case.
     *
     * The exception is the first record in the block, which must always be a record version record, whose value is a two-byte number; the value is 0x02.
     *
     * Some IPTC blocks are missing this first "record version" record, so we don't require it.
     */
373     public PhotoshopApp13Data parsePhotoshopSegment(final byte[] bytes, final ImagingParameters<JpegImagingParameters> params)
374             throws ImagingException, IOException {
375         final boolean strict = params != null && params.isStrict();
376 
377         return parsePhotoshopSegment(bytes, strict);
378     }
379 
380     public byte[] writeIptcBlock(List<IptcRecord> elements) throws ImagingException, IOException {
381         Charset charset = DEFAULT_CHARSET;
382         for (final IptcRecord element : elements) {
383             final byte[] recordData = element.getValue().getBytes(charset);
384             if (!new String(recordData, charset).equals(element.getValue())) {
385                 charset = StandardCharsets.UTF_8;
386                 break;
387             }
388         }
389         final byte[] blockData;
390         final ByteArrayOutputStream baos = new ByteArrayOutputStream();
391         try (BinaryOutputStream bos = BinaryOutputStream.create(baos, getByteOrder())) {
392             if (!charset.equals(DEFAULT_CHARSET)) {
393                 bos.write(IptcConstants.IPTC_RECORD_TAG_MARKER);
394                 bos.write(IptcConstants.IPTC_ENVELOPE_RECORD_NUMBER);
395                 bos.write(ENV_TAG_CODED_CHARACTER_SET);
396                 final byte[] codedCharset = CHARACTER_ESCAPE_SEQUENCE;
397                 bos.write2Bytes(codedCharset.length);
398                 bos.write(codedCharset);
399             }
400 
401             // first, right record version record
402             bos.write(IptcConstants.IPTC_RECORD_TAG_MARKER);
403             bos.write(IptcConstants.IPTC_APPLICATION_2_RECORD_NUMBER);
404             bos.write(IptcTypes.RECORD_VERSION.type); // record version record
405                                                       // type.
406             bos.write2Bytes(2); // record version record size
407             bos.write2Bytes(2); // record version value
408 
409             // make a copy of the list.
410             elements = new ArrayList<>(elements);
411 
412             // sort the list. Records must be in numerical order.
413             final Comparator<IptcRecord> comparator = (e1, e2) -> e2.iptcType.getType() - e1.iptcType.getType();
414             elements.sort(comparator);
415             // TODO: make sure order right
416 
417             // write the list.
418             for (final IptcRecord element : elements) {
419                 if (element.iptcType == IptcTypes.RECORD_VERSION) {
420                     continue; // ignore
421                 }
422 
423                 bos.write(IptcConstants.IPTC_RECORD_TAG_MARKER);
424                 bos.write(IptcConstants.IPTC_APPLICATION_2_RECORD_NUMBER);
425                 if (element.iptcType.getType() < 0 || element.iptcType.getType() > 0xff) {
426                     throw new ImagingException("Invalid record type: " + element.iptcType.getType());
427                 }
428                 bos.write(element.iptcType.getType());
429 
430                 final byte[] recordData = element.getValue().getBytes(charset);
431                 /*
432                  * if (!new String(recordData, charset).equals(element.getValue())) { throw new ImageWriteException( "Invalid record value, not " +
433                  * charset.name()); }
434                  */
435 
436                 bos.write2Bytes(recordData.length);
437                 bos.write(recordData);
438             }
439         }
440 
441         return baos.toByteArray();
442     }
443 
444     public byte[] writePhotoshopApp13Segment(final PhotoshopApp13Data data) throws IOException, ImagingException {
445         try (ByteArrayOutputStream os = new ByteArrayOutputStream();
446                 BinaryOutputStream bos = BinaryOutputStream.bigEndian(os)) {
447 
448             JpegConstants.PHOTOSHOP_IDENTIFICATION_STRING.writeTo(bos);
449 
450             final List<IptcBlock> blocks = data.getRawBlocks();
451             for (final IptcBlock block : blocks) {
452                 bos.write4Bytes(JpegConstants.CONST_8BIM);
453 
454                 if (block.getBlockType() < 0 || block.getBlockType() > 0xffff) {
455                     throw new ImagingException("Invalid IPTC block type.");
456                 }
457                 bos.write2Bytes(block.getBlockType());
458 
459                 final byte[] blockNameBytes = block.getBlockNameBytes();
460                 if (blockNameBytes.length > 255) {
461                     throw new ImagingException("IPTC block name is too long: " + blockNameBytes.length);
462                 }
463                 bos.write(blockNameBytes.length);
464                 bos.write(blockNameBytes);
465                 if (blockNameBytes.length % 2 == 0) {
466                     bos.write(0); // pad to even size, including length byte.
467                 }
468 
469                 final byte[] blockData = block.getBlockData();
470                 if (blockData.length > IptcConstants.IPTC_NON_EXTENDED_RECORD_MAXIMUM_SIZE) {
471                     throw new ImagingException("IPTC block data is too long: " + blockData.length);
472                 }
473                 bos.write4Bytes(blockData.length);
474                 bos.write(blockData);
475                 if (blockData.length % 2 == 1) {
476                     bos.write(0); // pad to even size
477                 }
478             }
479 
480             bos.flush();
481             return os.toByteArray();
482         }
483     }
484 
485 }