001/* 002 * Archives Unleashed Toolkit (AUT): 003 * An open-source platform for analyzing web archives. 004 * 005 * Licensed under the Apache License, Version 2.0 (the "License"); 006 * you may not use this file except in compliance with the License. 007 * You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package io.archivesunleashed.data; 018 019import java.io.BufferedInputStream; 020import java.io.ByteArrayInputStream; 021import java.io.ByteArrayOutputStream; 022import java.io.DataOutputStream; 023import java.io.IOException; 024import java.io.InputStream; 025import org.apache.commons.io.IOUtils; 026import org.apache.commons.io.input.BoundedInputStream; 027import org.apache.log4j.Logger; 028import org.archive.io.arc.ARCReader; 029import org.archive.io.arc.ARCReaderFactory; 030import org.archive.io.arc.ARCRecord; 031import org.archive.io.arc.ARCRecordMetaData; 032 033/** 034 * Utilities for working with {@code ARCRecord}s (from archive.org APIs). 035 */ 036public final class ArcRecordUtils { 037 038 /** 039 * Utility classes should not have a public or default constructor. 040 */ 041 private ArcRecordUtils() { 042 } 043 044 /** 045 * Setup logger. 046 */ 047 private static final Logger LOG = Logger.getLogger(ArcRecordUtils.class); 048 049 /** 050 * Converts raw bytes into an {@code ARCRecord}. 051 * 052 * @param bytes raw bytes 053 * @return parsed {@code ARCRecord} 054 * @throws IOException if there is an issue 055 */ 056 public static ARCRecord fromBytes(final byte[] bytes) throws IOException { 057 ARCReader reader = (ARCReader) ARCReaderFactory.get("", 058 new BufferedInputStream(new ByteArrayInputStream(bytes)), false); 059 return (ARCRecord) reader.get(); 060 } 061 062 /** 063 * Converts ARC record into raw bytes. 064 * 065 * @param record conents of WARC response record 066 * @return raw contents 067 * @throws IOException if there is an issue 068 */ 069 public static byte[] toBytes(final ARCRecord record) throws IOException { 070 ARCRecordMetaData meta = record.getMetaData(); 071 072 String metaline = meta.getUrl() + " " + meta.getIp() 073 + " " + meta.getDate() + " " + meta.getMimetype() 074 + " " + (int) meta.getLength(); 075 String versionEtc = ""; 076 077 078 if (meta.getOffset() == 0) { 079 versionEtc = "\n" + meta.getVersion().replace(".", " ") 080 + " " + meta.getOrigin() + "\n" 081 + "URL IP-address Archive-date Content-type Archive-length"; 082 metaline += versionEtc; 083 } 084 085 ByteArrayOutputStream baos = new ByteArrayOutputStream(); 086 DataOutputStream dout = new DataOutputStream(baos); 087 dout.write(metaline.getBytes()); 088 dout.write("\n".getBytes()); 089 090 long recordLength = meta.getLength() - versionEtc.length(); 091 long len = IOUtils.copyLarge(new BoundedInputStream(record, recordLength), 092 dout); 093 if (len != recordLength) { 094 LOG.error("Read " + len + " bytes but expected " + recordLength 095 + " bytes. Continuing..."); 096 } 097 return baos.toByteArray(); 098 } 099 100 /** 101 * Extracts raw contents from an {@code ARCRecord} (including HTTP headers). 102 * 103 * @param record the {@code ARCRecord} 104 * @return raw contents 105 * @throws IOException if there is an issue 106 */ 107 public static byte[] getContent(final ARCRecord record) throws IOException { 108 ARCRecordMetaData meta = record.getMetaData(); 109 String versionEtc = ""; 110 111 if (meta.getOffset() == 0) { 112 versionEtc = "\n" + meta.getVersion().replace(".", " ") 113 + " " + meta.getOrigin() + "\n" 114 + "URL IP-address Archive-date Content-type Archive-length"; 115 } 116 117 try { 118 return copyToByteArray(record, (int) meta.getLength() 119 - versionEtc.length(), true); 120 } catch (Exception e) { 121 // Catch exceptions related to any corrupt archive files. 122 return new byte[0]; 123 } 124 } 125 126 /** 127 * Extracts contents of the body from an {@code ARCRecord}. 128 * Excludes HTTP headers. 129 * 130 * @param record the {@code ARCRecord} 131 * @return contents of the body 132 * @throws IOException if there is an issue 133 */ 134 public static byte[] getBodyContent(final ARCRecord record) 135 throws IOException { 136 byte[] raw = getContent(record); 137 int bodyOffset = record.getBodyOffset(); 138 139 byte[] content = null; 140 try { 141 content = new byte[raw.length - bodyOffset]; 142 System.arraycopy(raw, bodyOffset, content, 0, content.length); 143 } catch (java.lang.NegativeArraySizeException e) { 144 // To find out what URL causing the error: record.getMetaData().getUrl() 145 // For some records, we're missing the actual content data, likely due 146 // to a crawler gitch. Nothing much we can do, just swallow and move on. 147 content = new byte[0]; 148 } 149 return content; 150 } 151 152 /** 153 * Copies contents to a byte array. 154 * 155 * @param is raw input stream 156 * @param recordLength is length of a record 157 * @param enforceLength enforce the length 158 * @return rawContents of body 159 * @throws IOException if there is an issue 160 */ 161 private static byte[] copyToByteArray(final InputStream is, 162 final int recordLength, final boolean enforceLength) 163 throws IOException { 164 165 BoundedInputStream bis = new BoundedInputStream(is, recordLength); 166 byte[] rawContents = IOUtils.toByteArray(bis); 167 if (enforceLength && rawContents.length != recordLength) { 168 LOG.error("Read " + rawContents.length + " bytes but expected " 169 + recordLength + " bytes. Continuing..."); 170 } 171 return rawContents; 172 } 173}