/*
 * Archives Unleashed Toolkit (AUT):
 * An open-source platform for analyzing web archives.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.archivesunleashed.data;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.HttpParser;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.BoundedInputStream;
import org.apache.log4j.Logger;
import org.archive.format.warc.WARCConstants;
import org.archive.io.warc.WARCReader;
import org.archive.io.warc.WARCReaderFactory;
import org.archive.io.warc.WARCRecord;

/**
 * Utilities for working with {@code WARCRecord}s (from archive.org APIs).
 */
public final class WarcRecordUtils implements WARCConstants {

  /**
   * Utility classes should not have a public or default constructor.
   */
  private WarcRecordUtils() {
  }

  /**
   * Setup logger.
   */
  private static final Logger LOG = Logger.getLogger(WarcRecordUtils.class);

  /**
   * Matches the first {@code Content-Type} header and captures its media
   * type (everything up to whitespace or a {@code ;} parameter separator).
   * Compiled once because {@code Pattern} instances are immutable and
   * compiling on every call is wasted work.
   */
  private static final Pattern CONTENT_TYPE_PATTERN =
      Pattern.compile("Content-Type: ([^\\s;]+) *(;.*)?",
          Pattern.CASE_INSENSITIVE);

  /**
   * Converts raw bytes into an {@code WARCRecord}.
   *
   * @param bytes raw bytes
   * @return parsed {@code WARCRecord}
   * @throws IOException if there is an issue
   */
  public static WARCRecord fromBytes(final byte[] bytes) throws IOException {
    // NOTE(review): the reader is left open on purpose; the returned record
    // presumably still reads from it, so closing here may invalidate the
    // record -- confirm against the WARCReader implementation.
    WARCReader reader = (WARCReader) WARCReaderFactory.get("",
        new BufferedInputStream(new ByteArrayInputStream(bytes)), false);
    return (WARCRecord) reader.get();
  }

  /**
   * Converts WARC record into raw bytes.
   *
   * <p>Writes a {@code WARC/0.17} version line, one {@code key: value} line
   * per header field, a blank line, and then whatever
   * {@code record.dump(...)} emits. Header text is always encoded as UTF-8
   * so the output does not depend on the JVM's platform default charset.
   *
   * @param record contents of WARC response record
   * @return raw contents
   * @throws IOException if there is an issue
   */
  public static byte[] toBytes(final WARCRecord record) throws IOException {
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    DataOutputStream dout = new DataOutputStream(baos);

    dout.write("WARC/0.17\n".getBytes(StandardCharsets.UTF_8));
    for (Map.Entry<String, Object> entry : record.getHeader()
        .getHeaderFields().entrySet()) {
      String headerLine =
          entry.getKey() + ": " + entry.getValue().toString() + "\n";
      dout.write(headerLine.getBytes(StandardCharsets.UTF_8));
    }
    dout.write("\n".getBytes(StandardCharsets.UTF_8));
    record.dump(dout);

    return baos.toByteArray();
  }

  /**
   * Extracts the MIME type of WARC response records.
   * "WARC-Type" is "response".
   * Note that this is different from the "Content-Type" in the WARC header.
   *
   * @param contents raw contents of the WARC response record
   * @return MIME type, or {@code null} if no {@code Content-Type} header is
   *     found
   */
  public static String getWarcResponseMimeType(final byte[] contents) {
    // This is a somewhat janky way to get the MIME type of the response.
    // Moreover, this simple regex is not compliant with the specification.
    // See: https://www.w3.org/Protocols/rfc1341/4_Content-Type.html
    // It would be much better to parse all headers using an external library:
    //   org.apache.commons.httpclient.HeaderElement
    // Note that this is different from the "Content-Type" in the WARC header.

    // Decode as ISO-8859-1: it maps every byte to exactly one char, so
    // arbitrary (possibly binary) payload bytes can never be malformed input
    // and the result no longer varies with the platform default charset.
    String text = new String(contents, StandardCharsets.ISO_8859_1);
    Matcher matcher = CONTENT_TYPE_PATTERN.matcher(text);
    if (matcher.find()) {
      // Group 1 is [^\s;]+ and therefore never contains a ';', so no
      // trailing-semicolon cleanup is required.
      return matcher.group(1);
    }

    return null;
  }

  /**
   * Extracts raw contents from a {@code WARCRecord} (including HTTP headers).
   *
   * <p>This is best-effort: a record whose declared length is negative or too
   * large for a Java array, or whose content cannot be read, yields an empty
   * array instead of an exception.
   *
   * @param record the {@code WARCRecord}
   * @return raw contents, or an empty array if the record cannot be read
   * @throws IOException if there is an issue
   */
  public static byte[] getContent(final WARCRecord record) throws IOException {
    // Validate as a long BEFORE narrowing: casting first would let lengths
    // above Integer.MAX_VALUE wrap around to a negative or, worse, a smaller
    // positive value that silently truncates the record.
    long declaredLength = record.getHeader().getContentLength();

    // If we have a corrupt record, quit and move on.
    if (declaredLength < 0) {
      return new byte[0];
    }
    if (declaredLength > Integer.MAX_VALUE) {
      LOG.warn("Record content length " + declaredLength
          + " exceeds the maximum array size. Skipping...");
      return new byte[0];
    }

    try {
      return copyToByteArray(record, (int) declaredLength, true);
    } catch (Exception e) {
      // Catch exceptions related to any corrupt archive files.
      LOG.warn("Unable to read record content. Skipping...", e);
      return new byte[0];
    }
  }

  /**
   * Extracts contents of the body from a {@code WARCRecord}.
   * Excludes HTTP headers.
   *
   * @param record the {@code WARCRecord}
   * @return contents of the body, or {@code null} if no initial line could
   *     be read from the record
   * @throws IOException if there is an issue
   */
  public static byte[] getBodyContent(final WARCRecord record)
      throws IOException {
    // Consume the first line (normally the HTTP status line).
    String line = HttpParser.readLine(record, WARC_HEADER_ENCODING);
    if (line == null) {
      return null;
    }

    // Just using parseHeaders to move down input stream to body
    HttpParser.parseHeaders(record, WARC_HEADER_ENCODING);

    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    record.dump(baos);
    return baos.toByteArray();
  }

  /**
   * Copies contents to a byte array.
   *
   * <p>Reads at most {@code recordLength} bytes from {@code is}. A short read
   * is only logged (when {@code enforceLength} is set); the bytes that were
   * read are still returned.
   *
   * @param is raw input stream
   * @param recordLength length of a record
   * @param enforceLength enforce the length
   * @return rawContents of body
   * @throws IOException if there is an issue
   */
  private static byte[] copyToByteArray(final InputStream is,
      final int recordLength, final boolean enforceLength)
      throws IOException {

    BoundedInputStream bis = new BoundedInputStream(is, recordLength);
    byte[] rawContents = IOUtils.toByteArray(bis);
    if (enforceLength && rawContents.length != recordLength) {
      LOG.error("Read " + rawContents.length + " bytes but expected "
          + recordLength + " bytes. Continuing...");
    }
    return rawContents;
  }
}