/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.imaging.formats.jpeg.iptc;

import static org.apache.commons.imaging.common.BinaryFunctions.read2Bytes;
import static org.apache.commons.imaging.common.BinaryFunctions.read4Bytes;
import static org.apache.commons.imaging.common.BinaryFunctions.readByte;
import static org.apache.commons.imaging.common.BinaryFunctions.readBytes;
import static org.apache.commons.imaging.common.BinaryFunctions.slice;
import static org.apache.commons.imaging.common.BinaryFunctions.startsWith;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteOrder;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.Objects;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.imaging.ImagingConstants;
import org.apache.commons.imaging.ImagingException;
import org.apache.commons.imaging.ImagingParameters;
import org.apache.commons.imaging.common.Allocator;
import org.apache.commons.imaging.common.BinaryFileParser;
import org.apache.commons.imaging.common.BinaryFunctions;
import org.apache.commons.imaging.common.BinaryOutputStream;
import org.apache.commons.imaging.common.ByteConversions;
import org.apache.commons.imaging.formats.jpeg.JpegConstants;
import org.apache.commons.imaging.formats.jpeg.JpegImagingParameters;
import org.apache.commons.imaging.internal.Debug;

/**
 * Reads and writes Photoshop App13 JPEG segments and the IPTC records stored in their image resource blocks.
 */
public class IptcParser extends BinaryFileParser {

    private static final Logger LOGGER = Logger.getLogger(IptcParser.class.getName());

    private static final ByteOrder APP13_BYTE_ORDER = ByteOrder.BIG_ENDIAN;

    /**
     * Block types (or Image Resource IDs) that are not recommended to be interpreted when libraries process Photoshop IPTC metadata.
     *
     * @see <a href="https://www.adobe.com/devnet-apps/photoshop/fileformatashtml/">Adobe Photoshop File Formats Specification</a>
     * @see <a href="https://issues.apache.org/jira/browse/IMAGING-246">IMAGING-246</a>
     * @since 1.0-alpha2
     */
    private static final List<Integer> PHOTOSHOP_IGNORED_BLOCK_TYPE = Arrays.asList(1084, 1085, 1086, 1087);

    /** Charset used for record values unless a coded character set record selects another one. */
    private static final Charset DEFAULT_CHARSET = StandardCharsets.ISO_8859_1;

    /** Record type, within the envelope record, that carries the coded character set. */
    private static final int ENV_TAG_CODED_CHARACTER_SET = 90;

    /** The escape sequence {@code ESC % G}; {@link #findCharset(byte[])} maps it to UTF-8. */
    private static final byte[] CHARACTER_ESCAPE_SEQUENCE = { '\u001B', '%', 'G' };

    /**
     * Constructs a parser that reads multi-byte values in big-endian order.
     */
    public IptcParser() {
        super(ByteOrder.BIG_ENDIAN);
    }

    /**
     * Resolves the charset announced by a coded character set record.
     * <p>
     * The value is first tried as a charset name. Otherwise, after removing space bytes, it is compared with the {@code ESC % G} escape sequence, which
     * selects UTF-8. Anything else yields the default charset.
     * </p>
     *
     * @param codedCharset the raw value of the coded character set record.
     * @return the charset to use for the record values; never {@code null}.
     */
    private Charset findCharset(final byte[] codedCharset) {
        final String codedCharsetString = new String(codedCharset, StandardCharsets.ISO_8859_1);
        try {
            if (Charset.isSupported(codedCharsetString)) {
                return Charset.forName(codedCharsetString);
            }
        } catch (final IllegalArgumentException ignored) {
            // Not a legal charset name; fall through to the escape sequence check.
        }

        // Normalize the byte sequence by dropping space bytes.
        final byte[] normalized = Allocator.byteArray(codedCharset.length);
        int count = 0;
        for (final byte element : codedCharset) {
            if (element != ' ') {
                normalized[count++] = element;
            }
        }

        // Compare only the bytes that were kept: the buffer is as long as the input, so comparing it whole
        // could never match once a space had been removed.
        if (Objects.deepEquals(Arrays.copyOf(normalized, count), CHARACTER_ESCAPE_SEQUENCE)) {
            return StandardCharsets.UTF_8;
        }
        return DEFAULT_CHARSET;
    }

    /**
     * Tests whether segment data starts with the Photoshop identification string immediately followed by the {@code 8BIM} signature.
     *
     * @param segmentData the segment data to test.
     * @return {@code true} if both the identification string and the signature are present.
     */
    public boolean isPhotoshopJpegSegment(final byte[] segmentData) {
        if (!startsWith(segmentData, JpegConstants.PHOTOSHOP_IDENTIFICATION_STRING)) {
            return false;
        }

        final int index = JpegConstants.PHOTOSHOP_IDENTIFICATION_STRING.size();
        return index + 4 <= segmentData.length && ByteConversions.toInt(segmentData, index, APP13_BYTE_ORDER) == JpegConstants.CONST_8BIM;
    }

    /**
     * Reads every image resource block of a Photoshop App13 segment.
     * <p>
     * Reading ends when the next block signature cannot be read. Blocks whose type is listed in {@link #PHOTOSHOP_IGNORED_BLOCK_TYPE} are skipped.
     * </p>
     *
     * @param bytes  the segment data, starting with the Photoshop identification string.
     * @param strict if {@code true}, an {@link IOException} raised while reading a block name or block data is rethrown; if {@code false}, reading stops
     *               there and the blocks read so far are returned.
     * @return the blocks read, in file order.
     * @throws ImagingException if the identification string or a block signature is wrong, or a block size is larger than the segment.
     * @throws IOException      if the data ends unexpectedly.
     */
    protected List<IptcBlock> parseAllBlocks(final byte[] bytes, final boolean strict) throws ImagingException, IOException {
        final List<IptcBlock> blocks = new ArrayList<>();

        try (InputStream bis = new ByteArrayInputStream(bytes)) {

            // Note that these are unsigned quantities. Name is always an even
            // number of bytes (including the 1st byte, which is the size.)

            final byte[] idString = readBytes("", bis, JpegConstants.PHOTOSHOP_IDENTIFICATION_STRING.size(), "App13 Segment missing identification string");
            if (!JpegConstants.PHOTOSHOP_IDENTIFICATION_STRING.equals(idString)) {
                throw new ImagingException("Not a Photoshop App13 Segment");
            }

            while (true) {
                final int imageResourceBlockSignature;
                try {
                    imageResourceBlockSignature = read4Bytes("", bis, "Image Resource Block missing identification string", APP13_BYTE_ORDER);
                } catch (final IOException ioEx) {
                    // No further signature can be read: treat as the end of the segment.
                    break;
                }
                if (imageResourceBlockSignature != JpegConstants.CONST_8BIM) {
                    throw new ImagingException("Invalid Image Resource Block Signature");
                }

                final int blockType = read2Bytes("", bis, "Image Resource Block missing type", APP13_BYTE_ORDER);
                Debug.debug("blockType: " + blockType + " (0x" + Integer.toHexString(blockType) + ")");

                // skip blocks that the photoshop spec recommends to, see IMAGING-246
                if (PHOTOSHOP_IGNORED_BLOCK_TYPE.contains(blockType)) {
                    Debug.debug("Skipping blockType: " + blockType + " (0x" + Integer.toHexString(blockType) + ")");
                    // if there is still data in this block, before the next image resource block
                    // (8BIM), then we must consume these bytes to leave a pointer ready to read
                    // the next block
                    BinaryFunctions.searchQuad(JpegConstants.CONST_8BIM, bis);
                    continue;
                }

                final int blockNameLength = readByte("Name length", bis, "Image Resource Block missing name length");
                if (blockNameLength > 0) {
                    Debug.debug("blockNameLength: " + blockNameLength + " (0x" + Integer.toHexString(blockNameLength) + ")");
                }
                final byte[] blockNameBytes;
                if (blockNameLength == 0) {
                    // An empty name is followed by one padding byte.
                    readByte("Block name bytes", bis, "Image Resource Block has invalid name");
                    blockNameBytes = ImagingConstants.EMPTY_BYTE_ARRAY;
                } else {
                    try {
                        blockNameBytes = readBytes("", bis, blockNameLength, "Invalid Image Resource Block name");
                    } catch (final IOException ioEx) {
                        if (strict) {
                            throw ioEx;
                        }
                        break;
                    }

                    // The length byte plus the name must occupy an even number of bytes.
                    if (blockNameLength % 2 == 0) {
                        readByte("Padding byte", bis, "Image Resource Block missing padding byte");
                    }
                }

                final int blockSize = read4Bytes("", bis, "Image Resource Block missing size", APP13_BYTE_ORDER);
                Debug.debug("blockSize: " + blockSize + " (0x" + Integer.toHexString(blockSize) + ")");

                /*
                 * Doesn't catch cases where the block size is invalid but is still less than bytes.length, but will at least prevent OutOfMemory errors. The
                 * size is an unsigned quantity, so a negative int is a value above Integer.MAX_VALUE and is rejected as well.
                 */
                if (blockSize < 0 || blockSize > bytes.length) {
                    throw new ImagingException("Invalid Block Size : " + Integer.toUnsignedString(blockSize) + " > " + bytes.length);
                }

                final byte[] blockData;
                try {
                    blockData = readBytes("", bis, blockSize, "Invalid Image Resource Block data");
                } catch (final IOException ioEx) {
                    if (strict) {
                        throw ioEx;
                    }
                    break;
                }

                blocks.add(new IptcBlock(blockType, blockNameBytes, blockData));

                // Block data is padded to an even length.
                if (blockSize % 2 != 0) {
                    readByte("Padding byte", bis, "Image Resource Block missing padding byte");
                }
            }

            return blocks;
        }
    }

    /**
     * Parses the records contained in the data of one IPTC block.
     * <p>
     * Only records whose record number is {@code IptcConstants.IPTC_APPLICATION_2_RECORD_NUMBER} are returned, and the record version record (type 0) is
     * skipped. A coded character set record in the envelope record selects the charset used to decode the values that follow it.
     * </p>
     * <p>
     * Parsing stops, returning the records read so far, at the first unexpected tag marker, extended dataset, or truncated record.
     * </p>
     *
     * @param bytes the block data.
     * @return the records read; never {@code null}.
     */
    protected List<IptcRecord> parseIptcBlock(final byte[] bytes) {
        Charset charset = DEFAULT_CHARSET;
        final List<IptcRecord> elements = new ArrayList<>();

        int index = 0;
        while (index + 1 < bytes.length) {
            final int tagMarker = 0xff & bytes[index++];
            Debug.debug("tagMarker: " + tagMarker + " (0x" + Integer.toHexString(tagMarker) + ")");

            if (tagMarker != IptcConstants.IPTC_RECORD_TAG_MARKER) {
                if (LOGGER.isLoggable(Level.FINE)) {
                    LOGGER.fine("Unexpected record tag marker in IPTC data.");
                }
                return elements;
            }

            // The rest of the header is: record number (1 byte), record type (1 byte), record size (2 bytes).
            if (bytes.length - index < 4) {
                if (LOGGER.isLoggable(Level.FINE)) {
                    LOGGER.fine("Truncated record header in IPTC data.");
                }
                return elements;
            }

            final int recordNumber = 0xff & bytes[index++];
            Debug.debug("recordNumber: " + recordNumber + " (0x" + Integer.toHexString(recordNumber) + ")");

            final int recordType = 0xff & bytes[index++];
            Debug.debug("recordType: " + recordType + " (0x" + Integer.toHexString(recordType) + ")");

            final int recordSize = ByteConversions.toUInt16(bytes, index, getByteOrder());
            index += 2;

            if (recordSize > IptcConstants.IPTC_NON_EXTENDED_RECORD_MAXIMUM_SIZE) {
                final int dataFieldCountLength = recordSize & 0x7fff;
                Debug.debug("extendedDataset. dataFieldCountLength: " + dataFieldCountLength);
                // ignore extended dataset and everything after.
                return elements;
            }

            // The declared size must fit in what is left of the block.
            if (recordSize > bytes.length - index) {
                if (LOGGER.isLoggable(Level.FINE)) {
                    LOGGER.fine("Truncated record data in IPTC data.");
                }
                return elements;
            }

            final byte[] recordData = slice(bytes, index, recordSize);
            index += recordSize;

            if (recordNumber == IptcConstants.IPTC_ENVELOPE_RECORD_NUMBER && recordType == ENV_TAG_CODED_CHARACTER_SET) {
                charset = findCharset(recordData);
                continue;
            }

            if (recordNumber != IptcConstants.IPTC_APPLICATION_2_RECORD_NUMBER) {
                continue;
            }

            if (recordType == 0) {
                if (LOGGER.isLoggable(Level.FINE)) {
                    LOGGER.fine("ignore record version record! " + elements.size());
                }
                // ignore "record version" record;
                continue;
            }

            final String value = new String(recordData, charset);
            final IptcType iptcType = IptcTypeLookup.getIptcType(recordType);
            elements.add(new IptcRecord(iptcType, value));
        }

        return elements;
    }

    /**
     * Parses a Photoshop App13 segment into its raw blocks and the IPTC records found in its IPTC blocks.
     *
     * @param bytes  the segment data, starting with the Photoshop identification string.
     * @param strict passed to {@link #parseAllBlocks(byte[], boolean)}.
     * @return the parsed records together with all raw blocks.
     * @throws ImagingException if the segment is malformed.
     * @throws IOException      if the data ends unexpectedly.
     */
    public PhotoshopApp13Data parsePhotoshopSegment(final byte[] bytes, final boolean strict) throws ImagingException, IOException {
        final List<IptcRecord> records = new ArrayList<>();

        final List<IptcBlock> blocks = parseAllBlocks(bytes, strict);

        for (final IptcBlock block : blocks) {
            // Ignore everything but IPTC data.
            if (block.isIptcBlock()) {
                records.addAll(parseIptcBlock(block.getBlockData()));
            }
        }

        return new PhotoshopApp13Data(records, blocks);
    }

    /*
     * In practice, App13 segments are only used for Photoshop/IPTC metadata. However, we should not treat App13 signatures without Photoshop's signature as
     * Photoshop/IPTC segments.
     *
     * A Photoshop/IPTC App13 segment begins with the Photoshop Identification string.
     *
     * There follows 0-N blocks (Photoshop calls them "Image Resource Blocks").
     *
     * Each block has the following structure:
     *
     * 1. 4-byte type. This is always "8BIM" for blocks in a Photoshop App13 segment.
     * 2. 2-byte id. IPTC data is stored in blocks with id 0x0404, aka. IPTC_NAA_RECORD_IMAGE_RESOURCE_ID
     * 3. Block name as a Pascal String. This is padded to have an even length.
     * 4. 4-byte size (in bytes).
     * 5. Block data. This is also padded to have an even length.
     *
     * The block data consists of a 0-N records. A record has the following structure:
     *
     * 1. 2-byte prefix. The value is always 0x1C02
     * 2. 1-byte record type. The record types are documented by the IPTC. See IptcConstants.
     * 3. 2-byte record size (in bytes).
     * 4. Record data, "record size" bytes long.
     *
     * Record data (unlike block data) is NOT padded to have an even length.
     *
     * Record data, for IPTC record, should always be ISO-8859-1. But according to SANSELAN-33, this isn't always the case.
     *
     * The exception is the first record in the block, which must always be a record version record, whose value is a two-byte number; the value is 0x02.
     *
     * Some IPTC blocks are missing this first "record version" record, so we don't require it.
     */
    /**
     * Parses a Photoshop App13 segment, taking the strictness from the given parameters.
     *
     * @param bytes  the segment data, starting with the Photoshop identification string.
     * @param params the imaging parameters; {@code null} means non-strict parsing.
     * @return the parsed records together with all raw blocks.
     * @throws ImagingException if the segment is malformed.
     * @throws IOException      if the data ends unexpectedly.
     */
    public PhotoshopApp13Data parsePhotoshopSegment(final byte[] bytes, final ImagingParameters<JpegImagingParameters> params)
            throws ImagingException, IOException {
        final boolean strict = params != null && params.isStrict();

        return parsePhotoshopSegment(bytes, strict);
    }

    /**
     * Serializes IPTC records into the data of an IPTC block.
     * <p>
     * Values are encoded in ISO-8859-1 when every value survives a round trip through that charset. Otherwise a coded character set record announcing
     * UTF-8 is written first and all values are encoded in UTF-8. A record version record is always written; record version entries in {@code elements}
     * are ignored.
     * </p>
     *
     * @param elements the records to write; the list itself is not modified.
     * @return the serialized block data.
     * @throws ImagingException if a record type does not fit in one byte or an encoded value is longer than
     *                          {@code IptcConstants.IPTC_NON_EXTENDED_RECORD_MAXIMUM_SIZE} bytes.
     * @throws IOException      if writing fails.
     */
    public byte[] writeIptcBlock(final List<IptcRecord> elements) throws ImagingException, IOException {
        Charset charset = DEFAULT_CHARSET;
        for (final IptcRecord element : elements) {
            final byte[] recordData = element.getValue().getBytes(charset);
            if (!new String(recordData, charset).equals(element.getValue())) {
                // At least one value cannot be represented in the default charset.
                charset = StandardCharsets.UTF_8;
                break;
            }
        }

        final ByteArrayOutputStream baos = new ByteArrayOutputStream();
        try (BinaryOutputStream bos = BinaryOutputStream.create(baos, getByteOrder())) {
            if (!charset.equals(DEFAULT_CHARSET)) {
                bos.write(IptcConstants.IPTC_RECORD_TAG_MARKER);
                bos.write(IptcConstants.IPTC_ENVELOPE_RECORD_NUMBER);
                bos.write(ENV_TAG_CODED_CHARACTER_SET);
                final byte[] codedCharset = CHARACTER_ESCAPE_SEQUENCE;
                bos.write2Bytes(codedCharset.length);
                bos.write(codedCharset);
            }

            // first, write record version record
            bos.write(IptcConstants.IPTC_RECORD_TAG_MARKER);
            bos.write(IptcConstants.IPTC_APPLICATION_2_RECORD_NUMBER);
            bos.write(IptcTypes.RECORD_VERSION.type); // record version record type.
            bos.write2Bytes(2); // record version record size
            bos.write2Bytes(2); // record version value

            // Sort a copy so that the caller's list is left untouched.
            final List<IptcRecord> sorted = new ArrayList<>(elements);

            // NOTE(review): this orders records by DESCENDING type, as this method always has; the original
            // comment said "Records must be in numerical order" and carried a TODO to confirm the order.
            final Comparator<IptcRecord> comparator = (e1, e2) -> Integer.compare(e2.iptcType.getType(), e1.iptcType.getType());
            sorted.sort(comparator);

            // write the list.
            for (final IptcRecord element : sorted) {
                if (element.iptcType == IptcTypes.RECORD_VERSION) {
                    continue; // ignore
                }

                final int type = element.iptcType.getType();
                if (type < 0 || type > 0xff) {
                    throw new ImagingException("Invalid record type: " + type);
                }

                final byte[] recordData = element.getValue().getBytes(charset);
                // The size field is two bytes and sizes above the non-extended maximum are read back as
                // extended datasets, so refuse to write a length that would be misinterpreted.
                if (recordData.length > IptcConstants.IPTC_NON_EXTENDED_RECORD_MAXIMUM_SIZE) {
                    throw new ImagingException("IPTC record data is too long: " + recordData.length);
                }

                bos.write(IptcConstants.IPTC_RECORD_TAG_MARKER);
                bos.write(IptcConstants.IPTC_APPLICATION_2_RECORD_NUMBER);
                bos.write(type);
                bos.write2Bytes(recordData.length);
                bos.write(recordData);
            }
        }

        return baos.toByteArray();
    }

    /**
     * Serializes the raw blocks of the given data into a Photoshop App13 segment.
     *
     * @param data the data whose raw blocks are written.
     * @return the segment bytes, starting with the Photoshop identification string.
     * @throws IOException      if writing fails.
     * @throws ImagingException if a block type does not fit in two bytes, a block name is longer than 255 bytes, or block data is longer than
     *                          {@code IptcConstants.IPTC_NON_EXTENDED_RECORD_MAXIMUM_SIZE} bytes.
     */
    public byte[] writePhotoshopApp13Segment(final PhotoshopApp13Data data) throws IOException, ImagingException {
        try (ByteArrayOutputStream os = new ByteArrayOutputStream();
                BinaryOutputStream bos = BinaryOutputStream.bigEndian(os)) {

            JpegConstants.PHOTOSHOP_IDENTIFICATION_STRING.writeTo(bos);

            final List<IptcBlock> blocks = data.getRawBlocks();
            for (final IptcBlock block : blocks) {
                bos.write4Bytes(JpegConstants.CONST_8BIM);

                if (block.getBlockType() < 0 || block.getBlockType() > 0xffff) {
                    throw new ImagingException("Invalid IPTC block type.");
                }
                bos.write2Bytes(block.getBlockType());

                final byte[] blockNameBytes = block.getBlockNameBytes();
                if (blockNameBytes.length > 255) {
                    throw new ImagingException("IPTC block name is too long: " + blockNameBytes.length);
                }
                bos.write(blockNameBytes.length);
                bos.write(blockNameBytes);
                if (blockNameBytes.length % 2 == 0) {
                    bos.write(0); // pad to even size, including length byte.
                }

                final byte[] blockData = block.getBlockData();
                if (blockData.length > IptcConstants.IPTC_NON_EXTENDED_RECORD_MAXIMUM_SIZE) {
                    throw new ImagingException("IPTC block data is too long: " + blockData.length);
                }
                bos.write4Bytes(blockData.length);
                bos.write(blockData);
                if (blockData.length % 2 == 1) {
                    bos.write(0); // pad to even size
                }
            }

            bos.flush();
            return os.toByteArray();
        }
    }

}