001/* 002 * Archives Unleashed Toolkit (AUT): 003 * An open-source platform for analyzing web archives. 004 * 005 * Licensed under the Apache License, Version 2.0 (the "License"); 006 * you may not use this file except in compliance with the License. 007 * You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package io.archivesunleashed.data; 018 019import java.io.BufferedInputStream; 020import java.io.ByteArrayInputStream; 021import java.io.ByteArrayOutputStream; 022import java.io.DataOutputStream; 023import java.io.IOException; 024import java.io.InputStream; 025import org.apache.commons.io.IOUtils; 026import org.apache.commons.io.input.BoundedInputStream; 027import org.apache.log4j.Logger; 028import org.archive.io.arc.ARCReader; 029import org.archive.io.arc.ARCReaderFactory; 030import org.archive.io.arc.ARCRecord; 031import org.archive.io.arc.ARCRecordMetaData; 032 033/** 034 * Utilities for working with {@code ARCRecord}s (from archive.org APIs). 035 */ 036public final class ArcRecordUtils { 037 038 /** 039 * Utility classes should not have a public or default constructor. 040 */ 041 private ArcRecordUtils() { 042 } 043 044 /** 045 * Setup logger. 046 */ 047 private static final Logger LOG = Logger.getLogger(ArcRecordUtils.class); 048 049 /** 050 * Converts raw bytes into an {@code ARCRecord}. 051 * 052 * @param bytes raw bytes 053 * @return parsed {@code ARCRecord} 054 * @throws IOException if there is an issue 055 */ 056 public static ARCRecord fromBytes(final byte[] bytes) throws IOException { 057 ARCReader reader = (ARCReader) ARCReaderFactory.get("", 058 new BufferedInputStream(new ByteArrayInputStream(bytes)), false); 059 return (ARCRecord) reader.get(); 060 } 061 062 /** 063 * Converts ARC record into raw bytes. 064 * 065 * @param record conents of WARC response record 066 * @return raw contents 067 * @throws IOException if there is an issue 068 */ 069 public static byte[] toBytes(final ARCRecord record) throws IOException { 070 ARCRecordMetaData meta = record.getMetaData(); 071 072 String metaline = meta.getUrl() + " " + meta.getIp() 073 + " " + meta.getDate() + " " + meta.getMimetype() 074 + " " + (int) meta.getLength(); 075 076 ByteArrayOutputStream baos = new ByteArrayOutputStream(); 077 DataOutputStream dout = new DataOutputStream(baos); 078 dout.write(metaline.getBytes()); 079 dout.write("\n".getBytes()); 080 081 long recordLength = meta.getLength(); 082 long len = IOUtils.copyLarge(new BoundedInputStream(record, recordLength), 083 dout); 084 if (len != recordLength) { 085 LOG.error("Read " + len + " bytes but expected " + recordLength 086 + " bytes. Continuing..."); 087 } 088 return baos.toByteArray(); 089 } 090 091 /** 092 * Extracts raw contents from an {@code ARCRecord} (including HTTP headers). 093 * 094 * @param record the {@code ARCRecord} 095 * @return raw contents 096 * @throws IOException if there is an issue 097 */ 098 public static byte[] getContent(final ARCRecord record) throws IOException { 099 ARCRecordMetaData meta = record.getMetaData(); 100 101 return copyToByteArray(record, (int) meta.getLength(), true); 102 } 103 104 /** 105 * Extracts contents of the body from an {@code ARCRecord}. 106 * Excludes HTTP headers. 107 * 108 * @param record the {@code ARCRecord} 109 * @return contents of the body 110 * @throws IOException if there is an issue 111 */ 112 public static byte[] getBodyContent(final ARCRecord record) 113 throws IOException { 114 byte[] raw = getContent(record); 115 int bodyOffset = record.getBodyOffset(); 116 117 byte[] content = null; 118 try { 119 content = new byte[raw.length - bodyOffset]; 120 System.arraycopy(raw, bodyOffset, content, 0, content.length); 121 } catch (java.lang.NegativeArraySizeException e) { 122 // To find out what URL causing the error: record.getMetaData().getUrl() 123 // For some records, we're missing the actual content data, likely due 124 // to a crawler gitch. Nothing much we can do, just swallow and move on. 125 content = new byte[0]; 126 } 127 return content; 128 } 129 130 /** 131 * Copies contents to a byte array. 132 * 133 * @param is raw input stream 134 * @param recordLength is length of a record 135 * @param enforceLength enforce the length 136 * @return rawContents of body 137 * @throws IOException if there is an issue 138 */ 139 private static byte[] copyToByteArray(final InputStream is, 140 final int recordLength, final boolean enforceLength) 141 throws IOException { 142 143 BoundedInputStream bis = new BoundedInputStream(is, recordLength); 144 byte[] rawContents = IOUtils.toByteArray(bis); 145 if (enforceLength && rawContents.length != recordLength) { 146 LOG.error("Read " + rawContents.length + " bytes but expected " 147 + recordLength + " bytes. Continuing..."); 148 } 149 return rawContents; 150 } 151}