001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.imaging.formats.jpeg.iptc;
019
020import static org.apache.commons.imaging.common.BinaryFunctions.read2Bytes;
021import static org.apache.commons.imaging.common.BinaryFunctions.read4Bytes;
022import static org.apache.commons.imaging.common.BinaryFunctions.readByte;
023import static org.apache.commons.imaging.common.BinaryFunctions.readBytes;
024import static org.apache.commons.imaging.common.BinaryFunctions.slice;
025import static org.apache.commons.imaging.common.BinaryFunctions.startsWith;
026
027import java.io.ByteArrayInputStream;
028import java.io.ByteArrayOutputStream;
029import java.io.IOException;
030import java.io.InputStream;
031import java.nio.ByteOrder;
032import java.nio.charset.Charset;
033import java.nio.charset.StandardCharsets;
034import java.util.ArrayList;
035import java.util.Arrays;
036import java.util.Comparator;
037import java.util.List;
038import java.util.Objects;
039import java.util.logging.Level;
040import java.util.logging.Logger;
041
042import org.apache.commons.imaging.ImagingConstants;
043import org.apache.commons.imaging.ImagingException;
044import org.apache.commons.imaging.ImagingParameters;
045import org.apache.commons.imaging.common.Allocator;
046import org.apache.commons.imaging.common.BinaryFileParser;
047import org.apache.commons.imaging.common.BinaryFunctions;
048import org.apache.commons.imaging.common.BinaryOutputStream;
049import org.apache.commons.imaging.common.ByteConversions;
050import org.apache.commons.imaging.formats.jpeg.JpegConstants;
051import org.apache.commons.imaging.formats.jpeg.JpegImagingParameters;
052import org.apache.commons.imaging.internal.Debug;
053
054public class IptcParser extends BinaryFileParser {
055
056    private static final Logger LOGGER = Logger.getLogger(IptcParser.class.getName());
057
058    private static final ByteOrder APP13_BYTE_ORDER = ByteOrder.BIG_ENDIAN;
059
060    /**
061     * Block types (or Image Resource IDs) that are not recommended to be interpreted when libraries process Photoshop IPTC metadata.
062     *
063     * @see <a href="https://www.adobe.com/devnet-apps/photoshop/fileformatashtml/">Adobe Photoshop File Formats Specification</a>
064     * @see <a href="https://issues.apache.org/jira/browse/IMAGING-246">IMAGING-246</a>
065     * @since 1.0-alpha2
066     */
067    private static final List<Integer> PHOTOSHOP_IGNORED_BLOCK_TYPE = Arrays.asList(1084, 1085, 1086, 1087);
068
069    private static final Charset DEFAULT_CHARSET = StandardCharsets.ISO_8859_1;
070    private static final int ENV_TAG_CODED_CHARACTER_SET = 90;
071    private static final byte[] CHARACTER_ESCAPE_SEQUENCE = { '\u001B', '%', 'G' };
072
073    public IptcParser() {
074        super(ByteOrder.BIG_ENDIAN);
075    }
076
077    private Charset findCharset(final byte[] codedCharset) {
078        final String codedCharsetString = new String(codedCharset, StandardCharsets.ISO_8859_1);
079        try {
080            if (Charset.isSupported(codedCharsetString)) {
081                return Charset.forName(codedCharsetString);
082            }
083        } catch (final IllegalArgumentException ignored) {
084            // ignored
085        }
086        // check if encoding is a escape sequence
087        // normalize encoding byte sequence
088        final byte[] codedCharsetNormalized = Allocator.byteArray(codedCharset.length);
089        int j = 0;
090        for (final byte element : codedCharset) {
091            if (element != ' ') {
092                codedCharsetNormalized[j++] = element;
093            }
094        }
095
096        if (Objects.deepEquals(codedCharsetNormalized, CHARACTER_ESCAPE_SEQUENCE)) {
097            return StandardCharsets.UTF_8;
098        }
099        return DEFAULT_CHARSET;
100    }
101
102    public boolean isPhotoshopJpegSegment(final byte[] segmentData) {
103        if (!startsWith(segmentData, JpegConstants.PHOTOSHOP_IDENTIFICATION_STRING)) {
104            return false;
105        }
106
107        final int index = JpegConstants.PHOTOSHOP_IDENTIFICATION_STRING.size();
108        return index + 4 <= segmentData.length && ByteConversions.toInt(segmentData, index, APP13_BYTE_ORDER) == JpegConstants.CONST_8BIM;
109    }
110
111    protected List<IptcBlock> parseAllBlocks(final byte[] bytes, final boolean strict) throws ImagingException, IOException {
112        final List<IptcBlock> blocks = new ArrayList<>();
113
114        try (InputStream bis = new ByteArrayInputStream(bytes)) {
115
116            // Note that these are unsigned quantities. Name is always an even
117            // number of bytes (including the 1st byte, which is the size.)
118
119            final byte[] idString = readBytes("", bis, JpegConstants.PHOTOSHOP_IDENTIFICATION_STRING.size(), "App13 Segment missing identification string");
120            if (!JpegConstants.PHOTOSHOP_IDENTIFICATION_STRING.equals(idString)) {
121                throw new ImagingException("Not a Photoshop App13 Segment");
122            }
123
124            // int index = PHOTOSHOP_IDENTIFICATION_STRING.length;
125
126            while (true) {
127                final int imageResourceBlockSignature;
128                try {
129                    imageResourceBlockSignature = read4Bytes("", bis, "Image Resource Block missing identification string", APP13_BYTE_ORDER);
130                } catch (final IOException ioEx) {
131                    break;
132                }
133                if (imageResourceBlockSignature != JpegConstants.CONST_8BIM) {
134                    throw new ImagingException("Invalid Image Resource Block Signature");
135                }
136
137                final int blockType = read2Bytes("", bis, "Image Resource Block missing type", APP13_BYTE_ORDER);
138                Debug.debug("blockType: " + blockType + " (0x" + Integer.toHexString(blockType) + ")");
139
140                // skip blocks that the photoshop spec recommends to, see IMAGING-246
141                if (PHOTOSHOP_IGNORED_BLOCK_TYPE.contains(blockType)) {
142                    Debug.debug("Skipping blockType: " + blockType + " (0x" + Integer.toHexString(blockType) + ")");
143                    // if there is still data in this block, before the next image resource block
144                    // (8BIM), then we must consume these bytes to leave a pointer ready to read
145                    // the next block
146                    BinaryFunctions.searchQuad(JpegConstants.CONST_8BIM, bis);
147                    continue;
148                }
149
150                final int blockNameLength = readByte("Name length", bis, "Image Resource Block missing name length");
151                if (blockNameLength > 0) {
152                    Debug.debug("blockNameLength: " + blockNameLength + " (0x" + Integer.toHexString(blockNameLength) + ")");
153                }
154                byte[] blockNameBytes;
155                if (blockNameLength == 0) {
156                    readByte("Block name bytes", bis, "Image Resource Block has invalid name");
157                    blockNameBytes = ImagingConstants.EMPTY_BYTE_ARRAY;
158                } else {
159                    try {
160                        blockNameBytes = readBytes("", bis, blockNameLength, "Invalid Image Resource Block name");
161                    } catch (final IOException ioEx) {
162                        if (strict) {
163                            throw ioEx;
164                        }
165                        break;
166                    }
167
168                    if (blockNameLength % 2 == 0) {
169                        readByte("Padding byte", bis, "Image Resource Block missing padding byte");
170                    }
171                }
172
173                final int blockSize = read4Bytes("", bis, "Image Resource Block missing size", APP13_BYTE_ORDER);
174                Debug.debug("blockSize: " + blockSize + " (0x" + Integer.toHexString(blockSize) + ")");
175
176                /*
177                 * doesn't catch cases where blocksize is invalid but is still less than bytes.length but will at least prevent OutOfMemory errors
178                 */
179                if (blockSize > bytes.length) {
180                    throw new ImagingException("Invalid Block Size : " + blockSize + " > " + bytes.length);
181                }
182
183                final byte[] blockData;
184                try {
185                    blockData = readBytes("", bis, blockSize, "Invalid Image Resource Block data");
186                } catch (final IOException ioEx) {
187                    if (strict) {
188                        throw ioEx;
189                    }
190                    break;
191                }
192
193                blocks.add(new IptcBlock(blockType, blockNameBytes, blockData));
194
195                if (blockSize % 2 != 0) {
196                    readByte("Padding byte", bis, "Image Resource Block missing padding byte");
197                }
198            }
199
200            return blocks;
201        }
202    }
203
204    protected List<IptcRecord> parseIptcBlock(final byte[] bytes) {
205        Charset charset = DEFAULT_CHARSET;
206        final List<IptcRecord> elements = new ArrayList<>();
207
208        int index = 0;
209        // Integer recordVersion = null;
210        while (index + 1 < bytes.length) {
211            final int tagMarker = 0xff & bytes[index++];
212            Debug.debug("tagMarker: " + tagMarker + " (0x" + Integer.toHexString(tagMarker) + ")");
213
214            if (tagMarker != IptcConstants.IPTC_RECORD_TAG_MARKER) {
215                if (LOGGER.isLoggable(Level.FINE)) {
216                    LOGGER.fine("Unexpected record tag marker in IPTC data.");
217                }
218                return elements;
219            }
220
221            final int recordNumber = 0xff & bytes[index++];
222            Debug.debug("recordNumber: " + recordNumber + " (0x" + Integer.toHexString(recordNumber) + ")");
223
224            // int recordPrefix = convertByteArrayToShort("recordPrefix", index,
225            // bytes);
226            // if (verbose)
227            // Debug.debug("recordPrefix", recordPrefix + " (0x"
228            // + Integer.toHexString(recordPrefix) + ")");
229            // index += 2;
230            //
231            // if (recordPrefix != IPTC_RECORD_PREFIX)
232            // {
233            // if (verbose)
234            // System.out
235            // .println("Unexpected record prefix in IPTC data!");
236            // return elements;
237            // }
238
239            // throw new ImageReadException(
240            // "Unexpected record prefix in IPTC data.");
241
242            final int recordType = 0xff & bytes[index];
243            Debug.debug("recordType: " + recordType + " (0x" + Integer.toHexString(recordType) + ")");
244            index++;
245
246            final int recordSize = ByteConversions.toUInt16(bytes, index, getByteOrder());
247            index += 2;
248
249            final boolean extendedDataset = recordSize > IptcConstants.IPTC_NON_EXTENDED_RECORD_MAXIMUM_SIZE;
250            final int dataFieldCountLength = recordSize & 0x7fff;
251            if (extendedDataset) {
252                Debug.debug("extendedDataset. dataFieldCountLength: " + dataFieldCountLength);
253            }
254            if (extendedDataset) {
255                // ignore extended dataset and everything after.
256                return elements;
257            }
258
259            final byte[] recordData = slice(bytes, index, recordSize);
260            index += recordSize;
261
262            // Debug.debug("recordSize", recordSize + " (0x"
263            // + Integer.toHexString(recordSize) + ")");
264
265            if (recordNumber == IptcConstants.IPTC_ENVELOPE_RECORD_NUMBER && recordType == ENV_TAG_CODED_CHARACTER_SET) {
266                charset = findCharset(recordData);
267                continue;
268            }
269
270            if (recordNumber != IptcConstants.IPTC_APPLICATION_2_RECORD_NUMBER) {
271                continue;
272            }
273
274            if (recordType == 0) {
275                if (LOGGER.isLoggable(Level.FINE)) {
276                    LOGGER.fine("ignore record version record! " + elements.size());
277                }
278                // ignore "record version" record;
279                continue;
280            }
281            // if (recordVersion == null)
282            // {
283            // // The first record in a JPEG/Photoshop IPTC block must be
284            // // the record version.
285            // if (recordType != 0)
286            // throw new ImageReadException("Missing record version: "
287            // + recordType);
288            // recordVersion = new Integer(convertByteArrayToShort(
289            // "recordNumber", recordData));
290            //
291            // if (recordSize != 2)
292            // throw new ImageReadException(
293            // "Invalid record version record size: " + recordSize);
294            //
295            // // JPEG/Photoshop IPTC metadata is always in Record version
296            // // 2
297            // if (recordVersion.intValue() != 2)
298            // throw new ImageReadException(
299            // "Invalid IPTC record version: " + recordVersion);
300            //
301            // // Debug.debug("recordVersion", recordVersion);
302            // continue;
303            // }
304
305            final String value = new String(recordData, charset);
306
307            final IptcType iptcType = IptcTypeLookup.getIptcType(recordType);
308
309            // Debug.debug("iptcType", iptcType);
310            // debugByteArray("iptcData", iptcData);
311            // Debug.debug();
312
313            // if (recordType == IPTC_TYPE_CREDIT.type
314            // || recordType == IPTC_TYPE_OBJECT_NAME.type)
315            // {
316            // this.debugByteArray("recordData", recordData);
317            // Debug.debug("index", IPTC_TYPE_CREDIT.name);
318            // }
319
320            final IptcRecord element = new IptcRecord(iptcType, value);
321            elements.add(element);
322        }
323
324        return elements;
325    }
326
327    public PhotoshopApp13Data parsePhotoshopSegment(final byte[] bytes, final boolean strict) throws ImagingException, IOException {
328        final List<IptcRecord> records = new ArrayList<>();
329
330        final List<IptcBlock> blocks = parseAllBlocks(bytes, strict);
331
332        for (final IptcBlock block : blocks) {
333            // Ignore everything but IPTC data.
334            if (!block.isIptcBlock()) {
335                continue;
336            }
337
338            records.addAll(parseIptcBlock(block.getBlockData()));
339        }
340
341        return new PhotoshopApp13Data(records, blocks);
342    }
343
344    // private void writeIPTCRecord(BinaryOutputStream bos, )
345
    /*
     * In practice, App13 segments are only used for Photoshop/IPTC metadata. However, we should not treat App13 segments without Photoshop's signature as
     * Photoshop/IPTC segments.
     *
     * A Photoshop/IPTC App13 segment begins with the Photoshop Identification string.
     *
     * There follows 0-N blocks (Photoshop calls them "Image Resource Blocks").
     *
     * Each block has the following structure:
     *
     * 1. 4-byte type. This is always "8BIM" for blocks in a Photoshop App13 segment.
     * 2. 2-byte id. IPTC data is stored in blocks with id 0x0404, a.k.a. IPTC_NAA_RECORD_IMAGE_RESOURCE_ID.
     * 3. Block name as a Pascal String. This is padded to have an even length.
     * 4. 4-byte size (in bytes).
     * 5. Block data. This is also padded to have an even length.
     *
     * The block data consists of 0-N records. A record has the following structure:
     *
     * 1. 2-byte prefix. The value is always 0x1C02.
     * 2. 1-byte record type. The record types are documented by the IPTC. See IptcConstants.
     * 3. 2-byte record size (in bytes).
     * 4. Record data, "record size" bytes long.
     *
     * Record data (unlike block data) is NOT padded to have an even length.
     *
     * Record data, for IPTC records, should always be ISO-8859-1. But according to SANSELAN-33, this isn't always the case.
     *
     * The exception is the first record in the block, which must always be a record version record, whose value is a two-byte number; the value is 0x02.
     *
     * Some IPTC blocks are missing this first "record version" record, so we don't require it.
     */
373    public PhotoshopApp13Data parsePhotoshopSegment(final byte[] bytes, final ImagingParameters<JpegImagingParameters> params)
374            throws ImagingException, IOException {
375        final boolean strict = params != null && params.isStrict();
376
377        return parsePhotoshopSegment(bytes, strict);
378    }
379
380    public byte[] writeIptcBlock(List<IptcRecord> elements) throws ImagingException, IOException {
381        Charset charset = DEFAULT_CHARSET;
382        for (final IptcRecord element : elements) {
383            final byte[] recordData = element.getValue().getBytes(charset);
384            if (!new String(recordData, charset).equals(element.getValue())) {
385                charset = StandardCharsets.UTF_8;
386                break;
387            }
388        }
389        final byte[] blockData;
390        final ByteArrayOutputStream baos = new ByteArrayOutputStream();
391        try (BinaryOutputStream bos = BinaryOutputStream.create(baos, getByteOrder())) {
392            if (!charset.equals(DEFAULT_CHARSET)) {
393                bos.write(IptcConstants.IPTC_RECORD_TAG_MARKER);
394                bos.write(IptcConstants.IPTC_ENVELOPE_RECORD_NUMBER);
395                bos.write(ENV_TAG_CODED_CHARACTER_SET);
396                final byte[] codedCharset = CHARACTER_ESCAPE_SEQUENCE;
397                bos.write2Bytes(codedCharset.length);
398                bos.write(codedCharset);
399            }
400
401            // first, right record version record
402            bos.write(IptcConstants.IPTC_RECORD_TAG_MARKER);
403            bos.write(IptcConstants.IPTC_APPLICATION_2_RECORD_NUMBER);
404            bos.write(IptcTypes.RECORD_VERSION.type); // record version record
405                                                      // type.
406            bos.write2Bytes(2); // record version record size
407            bos.write2Bytes(2); // record version value
408
409            // make a copy of the list.
410            elements = new ArrayList<>(elements);
411
412            // sort the list. Records must be in numerical order.
413            final Comparator<IptcRecord> comparator = (e1, e2) -> e2.iptcType.getType() - e1.iptcType.getType();
414            elements.sort(comparator);
415            // TODO: make sure order right
416
417            // write the list.
418            for (final IptcRecord element : elements) {
419                if (element.iptcType == IptcTypes.RECORD_VERSION) {
420                    continue; // ignore
421                }
422
423                bos.write(IptcConstants.IPTC_RECORD_TAG_MARKER);
424                bos.write(IptcConstants.IPTC_APPLICATION_2_RECORD_NUMBER);
425                if (element.iptcType.getType() < 0 || element.iptcType.getType() > 0xff) {
426                    throw new ImagingException("Invalid record type: " + element.iptcType.getType());
427                }
428                bos.write(element.iptcType.getType());
429
430                final byte[] recordData = element.getValue().getBytes(charset);
431                /*
432                 * if (!new String(recordData, charset).equals(element.getValue())) { throw new ImageWriteException( "Invalid record value, not " +
433                 * charset.name()); }
434                 */
435
436                bos.write2Bytes(recordData.length);
437                bos.write(recordData);
438            }
439        }
440
441        return baos.toByteArray();
442    }
443
444    public byte[] writePhotoshopApp13Segment(final PhotoshopApp13Data data) throws IOException, ImagingException {
445        try (ByteArrayOutputStream os = new ByteArrayOutputStream();
446                BinaryOutputStream bos = BinaryOutputStream.bigEndian(os)) {
447
448            JpegConstants.PHOTOSHOP_IDENTIFICATION_STRING.writeTo(bos);
449
450            final List<IptcBlock> blocks = data.getRawBlocks();
451            for (final IptcBlock block : blocks) {
452                bos.write4Bytes(JpegConstants.CONST_8BIM);
453
454                if (block.getBlockType() < 0 || block.getBlockType() > 0xffff) {
455                    throw new ImagingException("Invalid IPTC block type.");
456                }
457                bos.write2Bytes(block.getBlockType());
458
459                final byte[] blockNameBytes = block.getBlockNameBytes();
460                if (blockNameBytes.length > 255) {
461                    throw new ImagingException("IPTC block name is too long: " + blockNameBytes.length);
462                }
463                bos.write(blockNameBytes.length);
464                bos.write(blockNameBytes);
465                if (blockNameBytes.length % 2 == 0) {
466                    bos.write(0); // pad to even size, including length byte.
467                }
468
469                final byte[] blockData = block.getBlockData();
470                if (blockData.length > IptcConstants.IPTC_NON_EXTENDED_RECORD_MAXIMUM_SIZE) {
471                    throw new ImagingException("IPTC block data is too long: " + blockData.length);
472                }
473                bos.write4Bytes(blockData.length);
474                bos.write(blockData);
475                if (blockData.length % 2 == 1) {
476                    bos.write(0); // pad to even size
477                }
478            }
479
480            bos.flush();
481            return os.toByteArray();
482        }
483    }
484
485}