001/* 002 * Copyright © 2017 The Archives Unleashed Project 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016package io.archivesunleashed.data; 017 018import java.io.BufferedInputStream; 019import java.io.ByteArrayInputStream; 020import java.io.ByteArrayOutputStream; 021import java.io.DataOutputStream; 022import java.io.IOException; 023import java.io.InputStream; 024import java.util.Map; 025import java.util.regex.Matcher; 026import java.util.regex.Pattern; 027import org.apache.commons.httpclient.HttpParser; 028import org.apache.commons.io.IOUtils; 029import org.apache.commons.io.input.BoundedInputStream; 030import org.apache.log4j.Logger; 031import org.archive.format.warc.WARCConstants; 032import org.archive.io.warc.WARCReader; 033import org.archive.io.warc.WARCReaderFactory; 034import org.archive.io.warc.WARCRecord; 035 036/** Utilities for working with {@code WARCRecord}s (from archive.org APIs). */ 037public final class WarcRecordUtils implements WARCConstants { 038 039 /** Utility classes should not have a public or default constructor. */ 040 private WarcRecordUtils() {} 041 042 /** Setup logger. */ 043 private static final Logger LOG = Logger.getLogger(WarcRecordUtils.class); 044 045 /** 046 * Converts raw bytes into an {@code WARCRecord}. 047 * 048 * @param bytes raw bytes 049 * @return parsed {@code WARCRecord} 050 * @throws IOException if there is an issue 051 */ 052 public static WARCRecord fromBytes(final byte[] bytes) throws IOException { 053 WARCReader reader = 054 (WARCReader) 055 WARCReaderFactory.get( 056 "", new BufferedInputStream(new ByteArrayInputStream(bytes)), false); 057 return (WARCRecord) reader.get(); 058 } 059 060 /** 061 * Converts WARC record into raw bytes. 062 * 063 * @param record conents of WARC response record 064 * @return raw contents 065 * @throws IOException if there is an issue 066 */ 067 public static byte[] toBytes(final WARCRecord record) throws IOException { 068 ByteArrayOutputStream baos = new ByteArrayOutputStream(); 069 DataOutputStream dout = new DataOutputStream(baos); 070 071 dout.write("WARC/0.17\n".getBytes()); 072 for (Map.Entry<String, Object> entry : record.getHeader().getHeaderFields().entrySet()) { 073 dout.write((entry.getKey() + ": " + entry.getValue().toString() + "\n").getBytes()); 074 } 075 dout.write("\n".getBytes()); 076 record.dump(dout); 077 078 return baos.toByteArray(); 079 } 080 081 /** 082 * Extracts the MIME type of WARC response records. "WARC-Type" is "response". Note that this is 083 * different from the "Content-Type" in the WARC header. 084 * 085 * @param contents raw contents of the WARC response record 086 * @return MIME type 087 */ 088 public static String getWarcResponseMimeType(final byte[] contents) { 089 // This is a somewhat janky way to get the MIME type of the response. 090 // Moreover, this simple regex is not compliant with the specification. 091 // See: https://www.w3.org/Protocols/rfc1341/4_Content-Type.html 092 // It would be much better to parse all headers using an external library: 093 // org.apache.commons.httpclient.HeaderElement 094 // Note that this is different from the "Content-Type" in the WARC header. 095 Pattern pattern = Pattern.compile("Content-Type: ([^\\s;]+) *(;.*)?", Pattern.CASE_INSENSITIVE); 096 Matcher matcher = pattern.matcher(new String(contents)); 097 if (matcher.find()) { 098 return matcher.group(1).replaceAll(";$", ""); 099 } 100 101 return null; 102 } 103 104 /** 105 * Extracts raw contents from a {@code WARCRecord} (including HTTP headers). 106 * 107 * @param record the {@code WARCRecord} 108 * @return raw contents 109 * @throws IOException if there is an issue 110 */ 111 public static byte[] getContent(final WARCRecord record) throws IOException { 112 int len = (int) record.getHeader().getContentLength(); 113 114 // If we have a corrupt record, quit and move on. 115 if (len < 0) { 116 return new byte[0]; 117 } 118 119 try { 120 return copyToByteArray(record, len, true); 121 } catch (Exception e) { 122 // Catch exceptions related to any corrupt archive files. 123 return new byte[0]; 124 } 125 } 126 127 /** 128 * Extracts contents of the body from a {@code WARCRecord}. Excludes HTTP headers. 129 * 130 * @param record the {@code WARCRecord} 131 * @return contents of the body 132 * @throws IOException if there is an issue 133 */ 134 public static byte[] getBodyContent(final WARCRecord record) throws IOException { 135 ByteArrayOutputStream baos = new ByteArrayOutputStream(); 136 String line = HttpParser.readLine(record, WARC_HEADER_ENCODING); 137 if (line == null) { 138 return null; 139 } 140 141 // Just using parseHeaders to move down input stream to body. 142 HttpParser.parseHeaders(record, WARC_HEADER_ENCODING); 143 record.dump(baos); 144 return baos.toByteArray(); 145 } 146 147 /** 148 * Copies contents to a byte array. 149 * 150 * @param is raw input stream 151 * @param recordLength length of a record 152 * @param enforceLength enforce the length 153 * @return rawContents of body 154 * @throws IOException if there is an issue 155 */ 156 private static byte[] copyToByteArray( 157 final InputStream is, final int recordLength, final boolean enforceLength) 158 throws IOException { 159 160 BoundedInputStream bis = new BoundedInputStream(is, recordLength); 161 byte[] rawContents = IOUtils.toByteArray(bis); 162 if (enforceLength && rawContents.length != recordLength) { 163 LOG.error( 164 "Read " 165 + rawContents.length 166 + " bytes but expected " 167 + recordLength 168 + " bytes. Continuing..."); 169 } 170 return rawContents; 171 } 172}