/*
 * Archives Unleashed Toolkit (AUT):
 * An open-source platform for analyzing web archives.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.archivesunleashed.data;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.BoundedInputStream;
import org.apache.log4j.Logger;
import org.archive.io.arc.ARCReader;
import org.archive.io.arc.ARCReaderFactory;
import org.archive.io.arc.ARCRecord;
import org.archive.io.arc.ARCRecordMetaData;

/**
 * Utilities for working with {@code ARCRecord}s (from archive.org APIs).
 */
public final class ArcRecordUtils {

  /**
   * Utility classes should not have a public or default constructor.
   */
  private ArcRecordUtils() {
  }

  /**
   * Setup logger.
   */
  private static final Logger LOG = Logger.getLogger(ArcRecordUtils.class);

  /**
   * Converts raw bytes into an {@code ARCRecord}.
   *
   * @param bytes raw bytes
   * @return parsed {@code ARCRecord}
   * @throws IOException if there is an issue
   */
  public static ARCRecord fromBytes(final byte[] bytes) throws IOException {
    // NOTE(review): the reader is left open on purpose; presumably the
    // returned record still reads from the reader's stream -- confirm
    // before adding a close() here.
    ARCReader reader = (ARCReader) ARCReaderFactory.get("",
        new BufferedInputStream(new ByteArrayInputStream(bytes)), false);
    return (ARCRecord) reader.get();
  }

  /**
   * Converts ARC record into raw bytes.
   *
   * <p>The output is the record's metadata line (URL, IP, date, MIME type
   * and length, space separated), followed for the record at offset 0 by
   * the version block, then a newline and the record's remaining bytes.
   * Header text is encoded as UTF-8 so the result does not depend on the
   * platform default charset.
   *
   * @param record contents of the ARC record
   * @return raw contents
   * @throws IOException if there is an issue
   */
  public static byte[] toBytes(final ARCRecord record) throws IOException {
    ARCRecordMetaData meta = record.getMetaData();
    String versionEtc = versionBlock(meta);

    // The length is appended without narrowing so that values larger than
    // Integer.MAX_VALUE are not written as a truncated number.
    String metaline = meta.getUrl() + " " + meta.getIp()
        + " " + meta.getDate() + " " + meta.getMimetype()
        + " " + meta.getLength() + versionEtc;

    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    DataOutputStream dout = new DataOutputStream(baos);
    dout.write(metaline.getBytes(StandardCharsets.UTF_8));
    dout.write('\n');

    // The version block has already been written as part of the metaline,
    // so that many characters are excluded from the payload to copy.
    // NOTE(review): this uses the character count of the block; it equals
    // the byte count only while version and origin are ASCII -- confirm.
    long recordLength = meta.getLength() - versionEtc.length();
    long len = IOUtils.copyLarge(
        new BoundedInputStream(record, recordLength), dout);
    if (len != recordLength) {
      LOG.error("Read " + len + " bytes but expected " + recordLength
          + " bytes. Continuing...");
    }
    return baos.toByteArray();
  }

  /**
   * Extracts raw contents from an {@code ARCRecord} (including HTTP headers).
   *
   * <p>For the record at offset 0 the number of bytes read is reduced by
   * the length of the version block.
   *
   * @param record the {@code ARCRecord}
   * @return raw contents
   * @throws IOException if there is an issue
   */
  public static byte[] getContent(final ARCRecord record) throws IOException {
    ARCRecordMetaData meta = record.getMetaData();
    String versionEtc = versionBlock(meta);

    return copyToByteArray(record,
        (int) meta.getLength() - versionEtc.length(), true);
  }

  /**
   * Extracts contents of the body from an {@code ARCRecord}.
   * Excludes HTTP headers.
   *
   * <p>If the body offset reported by the record lies beyond the bytes
   * that could actually be read, an empty array is returned.
   *
   * @param record the {@code ARCRecord}
   * @return contents of the body
   * @throws IOException if there is an issue
   */
  public static byte[] getBodyContent(final ARCRecord record)
      throws IOException {
    byte[] raw = getContent(record);
    int bodyOffset = record.getBodyOffset();

    if (bodyOffset > raw.length) {
      // To find out what URL is causing this: record.getMetaData().getUrl()
      // For some records, we're missing the actual content data, likely due
      // to a crawler glitch. Nothing much we can do, so return an empty body.
      return new byte[0];
    }

    byte[] content = new byte[raw.length - bodyOffset];
    System.arraycopy(raw, bodyOffset, content, 0, content.length);
    return content;
  }

  /**
   * Builds the version block text that accompanies the record at offset 0.
   *
   * <p>The block is a newline, the version with dots replaced by spaces,
   * the origin, another newline and the field-name line. For any record
   * whose offset is not 0 the result is the empty string.
   *
   * @param meta metadata of the record
   * @return version block text, or the empty string
   */
  private static String versionBlock(final ARCRecordMetaData meta) {
    if (meta.getOffset() != 0) {
      return "";
    }
    return "\n" + meta.getVersion().replace(".", " ")
        + " " + meta.getOrigin() + "\n"
        + "URL IP-address Archive-date Content-type Archive-length";
  }

  /**
   * Copies contents to a byte array.
   *
   * <p>At most {@code recordLength} bytes are read from the stream. When
   * {@code enforceLength} is set and fewer bytes were available, the
   * mismatch is logged as an error; no exception is thrown and the bytes
   * that were read are still returned.
   *
   * @param is raw input stream
   * @param recordLength is length of a record
   * @param enforceLength enforce the length
   * @return rawContents of body
   * @throws IOException if there is an issue
   */
  private static byte[] copyToByteArray(final InputStream is,
      final int recordLength, final boolean enforceLength)
      throws IOException {

    BoundedInputStream bis = new BoundedInputStream(is, recordLength);
    byte[] rawContents = IOUtils.toByteArray(bis);
    if (enforceLength && rawContents.length != recordLength) {
      LOG.error("Read " + rawContents.length + " bytes but expected "
          + recordLength + " bytes. Continuing...");
    }
    return rawContents;
  }
}