001/* 002 * Copyright © 2017 The Archives Unleashed Project 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016package io.archivesunleashed.data; 017 018import java.io.BufferedInputStream; 019import java.io.ByteArrayInputStream; 020import java.io.ByteArrayOutputStream; 021import java.io.DataOutputStream; 022import java.io.IOException; 023import java.io.InputStream; 024import java.util.Map; 025import java.util.regex.Matcher; 026import java.util.regex.Pattern; 027import org.apache.commons.httpclient.HttpParser; 028import org.apache.commons.io.IOUtils; 029import org.apache.commons.io.input.BoundedInputStream; 030import org.apache.log4j.Logger; 031import org.archive.format.warc.WARCConstants; 032import org.archive.io.warc.WARCReader; 033import org.archive.io.warc.WARCReaderFactory; 034import org.archive.io.warc.WARCRecord; 035 036/** 037 * Utilities for working with {@code WARCRecord}s (from archive.org APIs). 038 */ 039public final class WarcRecordUtils implements WARCConstants { 040 041 /** 042 * Utility classes should not have a public or default constructor. 043 */ 044 private WarcRecordUtils() { 045 } 046 047 /** 048 * Setup logger. 049 */ 050 private static final Logger LOG = Logger.getLogger(WarcRecordUtils.class); 051 052 /** 053 * Converts raw bytes into an {@code WARCRecord}. 054 * 055 * @param bytes raw bytes 056 * @return parsed {@code WARCRecord} 057 * @throws IOException if there is an issue 058 */ 059 public static WARCRecord fromBytes(final byte[] bytes) throws IOException { 060 WARCReader reader = (WARCReader) WARCReaderFactory.get("", 061 new BufferedInputStream(new ByteArrayInputStream(bytes)), false); 062 return (WARCRecord) reader.get(); 063 } 064 065 /** 066 * Converts WARC record into raw bytes. 067 * 068 * @param record conents of WARC response record 069 * @return raw contents 070 * @throws IOException if there is an issue 071 */ 072 public static byte[] toBytes(final WARCRecord record) throws IOException { 073 ByteArrayOutputStream baos = new ByteArrayOutputStream(); 074 DataOutputStream dout = new DataOutputStream(baos); 075 076 dout.write("WARC/0.17\n".getBytes()); 077 for (Map.Entry<String, Object> entry : record.getHeader() 078 .getHeaderFields().entrySet()) { 079 dout.write((entry.getKey() + ": " + entry.getValue().toString() + "\n") 080 .getBytes()); 081 } 082 dout.write("\n".getBytes()); 083 record.dump(dout); 084 085 return baos.toByteArray(); 086 } 087 088 /** 089 * Extracts the MIME type of WARC response records. 090 * "WARC-Type" is "response". 091 * Note that this is different from the "Content-Type" in the WARC header. 092 * 093 * @param contents raw contents of the WARC response record 094 * @return MIME type 095 */ 096 public static String getWarcResponseMimeType(final byte[] contents) { 097 // This is a somewhat janky way to get the MIME type of the response. 098 // Moreover, this simple regex is not compliant with the specification. 099 // See: https://www.w3.org/Protocols/rfc1341/4_Content-Type.html 100 // It would be much better to parse all headers using an external library: 101 // org.apache.commons.httpclient.HeaderElement 102 // Note that this is different from the "Content-Type" in the WARC header. 103 Pattern pattern = Pattern.compile("Content-Type: ([^\\s;]+) *(;.*)?", 104 Pattern.CASE_INSENSITIVE); 105 Matcher matcher = pattern.matcher(new String(contents)); 106 if (matcher.find()) { 107 return matcher.group(1).replaceAll(";$", ""); 108 } 109 110 return null; 111 } 112 113 /** 114 * Extracts raw contents from a {@code WARCRecord} (including HTTP headers). 115 * 116 * @param record the {@code WARCRecord} 117 * @return raw contents 118 * @throws IOException if there is an issue 119 */ 120 public static byte[] getContent(final WARCRecord record) throws IOException { 121 int len = (int) record.getHeader().getContentLength(); 122 123 // If we have a corrupt record, quit and move on. 124 if (len < 0) { 125 return new byte[0]; 126 } 127 128 try { 129 return copyToByteArray(record, len, true); 130 } catch (Exception e) { 131 // Catch exceptions related to any corrupt archive files. 132 return new byte[0]; 133 } 134 } 135 136 /** 137 * Extracts contents of the body from a {@code WARCRecord}. 138 * Excludes HTTP headers. 139 * 140 * @param record the {@code WARCRecord} 141 * @return contents of the body 142 * @throws IOException if there is an issue 143 */ 144 public static byte[] getBodyContent(final WARCRecord record) 145 throws IOException { 146 ByteArrayOutputStream baos = new ByteArrayOutputStream(); 147 String line = HttpParser.readLine(record, WARC_HEADER_ENCODING); 148 if (line == null) { 149 return null; 150 } 151 152 // Just using parseHeaders to move down input stream to body. 153 HttpParser.parseHeaders(record, WARC_HEADER_ENCODING); 154 record.dump(baos); 155 return baos.toByteArray(); 156 } 157 158 /** 159 * Copies contents to a byte array. 160 * 161 * @param is raw input stream 162 * @param recordLength length of a record 163 * @param enforceLength enforce the length 164 * @return rawContents of body 165 * @throws IOException if there is an issue 166 */ 167 private static byte[] copyToByteArray(final InputStream is, 168 final int recordLength, final boolean enforceLength) 169 throws IOException { 170 171 BoundedInputStream bis = new BoundedInputStream(is, recordLength); 172 byte[] rawContents = IOUtils.toByteArray(bis); 173 if (enforceLength && rawContents.length != recordLength) { 174 LOG.error("Read " + rawContents.length + " bytes but expected " 175 + recordLength + " bytes. Continuing..."); 176 } 177 return rawContents; 178 } 179}