001/* 002 * Archives Unleashed Toolkit (AUT): 003 * An open-source platform for analyzing web archives. 004 * 005 * Licensed under the Apache License, Version 2.0 (the "License"); 006 * you may not use this file except in compliance with the License. 007 * You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package io.archivesunleashed.data; 018 019import java.io.BufferedInputStream; 020import java.io.ByteArrayInputStream; 021import java.io.ByteArrayOutputStream; 022import java.io.DataOutputStream; 023import java.io.IOException; 024import java.io.InputStream; 025import java.util.Map; 026import java.util.regex.Matcher; 027import java.util.regex.Pattern; 028import org.apache.commons.httpclient.HttpParser; 029import org.apache.commons.io.IOUtils; 030import org.apache.commons.io.input.BoundedInputStream; 031import org.apache.log4j.Logger; 032import org.archive.io.warc.WARCConstants; 033import org.archive.io.warc.WARCReader; 034import org.archive.io.warc.WARCReaderFactory; 035import org.archive.io.warc.WARCRecord; 036 037/** 038 * Utilities for working with {@code WARCRecord}s (from archive.org APIs). 039 */ 040public final class WarcRecordUtils implements WARCConstants { 041 042 /** 043 * Utility classes should not have a public or default constructor. 044 */ 045 private WarcRecordUtils() { 046 } 047 048 /** 049 * Setup logger. 050 */ 051 private static final Logger LOG = Logger.getLogger(WarcRecordUtils.class); 052 053 /** 054 * Converts raw bytes into an {@code WARCRecord}. 055 * 056 * @param bytes raw bytes 057 * @return parsed {@code WARCRecord} 058 * @throws IOException if there is an issue 059 */ 060 public static WARCRecord fromBytes(final byte[] bytes) throws IOException { 061 WARCReader reader = (WARCReader) WARCReaderFactory.get("", 062 new BufferedInputStream(new ByteArrayInputStream(bytes)), false); 063 return (WARCRecord) reader.get(); 064 } 065 066 /** 067 * Converts WARC record into raw bytes. 068 * 069 * @param record conents of WARC response record 070 * @return raw contents 071 * @throws IOException if there is an issue 072 */ 073 public static byte[] toBytes(final WARCRecord record) throws IOException { 074 ByteArrayOutputStream baos = new ByteArrayOutputStream(); 075 DataOutputStream dout = new DataOutputStream(baos); 076 077 dout.write("WARC/0.17\n".getBytes()); 078 for (Map.Entry<String, Object> entry : record.getHeader() 079 .getHeaderFields().entrySet()) { 080 dout.write((entry.getKey() + ": " + entry.getValue().toString() + "\n") 081 .getBytes()); 082 } 083 dout.write("\n".getBytes()); 084 record.dump(dout); 085 086 return baos.toByteArray(); 087 } 088 089 /** 090 * Extracts the MIME type of WARC response records. 091 * "WARC-Type" is "response". 092 * Note that this is different from the "Content-Type" in the WARC header. 093 * 094 * @param contents raw contents of the WARC response record 095 * @return MIME type 096 */ 097 public static String getWarcResponseMimeType(final byte[] contents) { 098 // This is a somewhat janky way to get the MIME type of the response. 099 // Note that this is different from the "Content-Type" in the WARC header. 100 Pattern pattern = Pattern.compile("Content-Type: ([^\\s]+)", 101 Pattern.CASE_INSENSITIVE); 102 Matcher matcher = pattern.matcher(new String(contents)); 103 if (matcher.find()) { 104 return matcher.group(1).replaceAll(";$", ""); 105 } 106 107 return null; 108 } 109 110 /** 111 * Extracts raw contents from a {@code WARCRecord} (including HTTP headers). 112 * 113 * @param record the {@code WARCRecord} 114 * @return raw contents 115 * @throws IOException if there is an issue 116 */ 117 public static byte[] getContent(final WARCRecord record) throws IOException { 118 int len = (int) record.getHeader().getContentLength(); 119 120 // If we have a corrupt record, quit and move on. 121 if (len < 0) { 122 return new byte[0]; 123 } 124 125 try { 126 return copyToByteArray(record, len, true); 127 } catch (Exception e) { 128 // Catch exceptions related to any corrupt archive files. 129 return new byte[0]; 130 } 131 } 132 133 /** 134 * Extracts contents of the body from a {@code WARCRecord}. 135 * Excludes HTTP headers. 136 * 137 * @param record the {@code WARCRecord} 138 * @return contents of the body 139 * @throws IOException if there is an issue 140 */ 141 public static byte[] getBodyContent(final WARCRecord record) 142 throws IOException { 143 ByteArrayOutputStream baos = new ByteArrayOutputStream(); 144 String line = HttpParser.readLine(record, WARC_HEADER_ENCODING); 145 if (line == null) { 146 return null; 147 } 148 149 // Just using parseHeaders to move down input stream to body 150 HttpParser.parseHeaders(record, WARC_HEADER_ENCODING); 151 record.dump(baos); 152 return baos.toByteArray(); 153 } 154 155 /** 156 * Copies contents to a byte array. 157 * 158 * @param is raw input stream 159 * @param recordLength length of a record 160 * @param enforceLength enforce the length 161 * @return rawContents of body 162 * @throws IOException if there is an issue 163 */ 164 private static byte[] copyToByteArray(final InputStream is, 165 final int recordLength, final boolean enforceLength) 166 throws IOException { 167 168 BoundedInputStream bis = new BoundedInputStream(is, recordLength); 169 byte[] rawContents = IOUtils.toByteArray(bis); 170 if (enforceLength && rawContents.length != recordLength) { 171 LOG.error("Read " + rawContents.length + " bytes but expected " 172 + recordLength + " bytes. Continuing..."); 173 } 174 return rawContents; 175 } 176}