Source code

001/*
002 * Archives Unleashed Toolkit (AUT):
003 * An open-source platform for analyzing web archives.
004 *
005 * Licensed under the Apache License, Version 2.0 (the "License");
006 * you may not use this file except in compliance with the License.
007 * You may obtain a copy of the License at
008 *
009 *     http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package io.archivesunleashed.data;
018
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.HttpParser;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.BoundedInputStream;
import org.apache.log4j.Logger;
import org.archive.io.warc.WARCConstants;
import org.archive.io.warc.WARCReader;
import org.archive.io.warc.WARCReaderFactory;
import org.archive.io.warc.WARCRecord;
036
037/**
038 * Utilities for working with {@code WARCRecord}s (from archive.org APIs).
039 */
040public final class WarcRecordUtils implements WARCConstants {
041
042  /**
043   * Utility classes should not have a public or default constructor.
044   */
045  private WarcRecordUtils() {
046  }
047
048  /**
049  * Setup logger.
050  */
051  private static final Logger LOG = Logger.getLogger(WarcRecordUtils.class);
052
053  /**
054   * Converts raw bytes into an {@code WARCRecord}.
055   *
056   * @param bytes raw bytes
057   * @return parsed {@code WARCRecord}
058   * @throws IOException if there is an issue
059   */
060  public static WARCRecord fromBytes(final byte[] bytes) throws IOException {
061    WARCReader reader = (WARCReader) WARCReaderFactory.get("",
062        new BufferedInputStream(new ByteArrayInputStream(bytes)), false);
063    return (WARCRecord) reader.get();
064  }
065
066  /**
067   * Converts WARC record into raw bytes.
068   *
069   * @param record conents of WARC response record
070   * @return raw contents
071   * @throws IOException if there is an issue
072   */
073  public static byte[] toBytes(final WARCRecord record) throws IOException {
074    ByteArrayOutputStream baos = new ByteArrayOutputStream();
075    DataOutputStream dout = new DataOutputStream(baos);
076
077    dout.write("WARC/0.17\n".getBytes());
078    for (Map.Entry<String, Object> entry : record.getHeader()
079            .getHeaderFields().entrySet()) {
080      dout.write((entry.getKey() + ": " + entry.getValue().toString() + "\n")
081              .getBytes());
082    }
083    dout.write("\n".getBytes());
084    record.dump(dout);
085
086    return baos.toByteArray();
087  }
088
089  /**
090   * Extracts the MIME type of WARC response records.
091   * "WARC-Type" is "response".
092   * Note that this is different from the "Content-Type" in the WARC header.
093   *
094   * @param contents raw contents of the WARC response record
095   * @return MIME type
096   */
097  public static String getWarcResponseMimeType(final byte[] contents) {
098    // This is a somewhat janky way to get the MIME type of the response.
099    // Note that this is different from the "Content-Type" in the WARC header.
100    Pattern pattern = Pattern.compile("Content-Type: ([^\\s]+)",
101            Pattern.CASE_INSENSITIVE);
102    Matcher matcher = pattern.matcher(new String(contents));
103    if (matcher.find()) {
104      return matcher.group(1).replaceAll(";$", "");
105    }
106
107    return null;
108  }
109
110  /**
111   * Extracts raw contents from a {@code WARCRecord} (including HTTP headers).
112   *
113   * @param record the {@code WARCRecord}
114   * @return raw contents
115   * @throws IOException if there is an issue
116   */
117  public static byte[] getContent(final WARCRecord record) throws IOException {
118    int len = (int) record.getHeader().getContentLength();
119
120    // If we have a corrupt record, quit and move on.
121    if (len < 0) {
122      return new byte[0];
123    }
124
125    try {
126      return copyToByteArray(record, len, true);
127    } catch (Exception e) {
128      // Catch exceptions related to any corrupt archive files.
129      return new byte[0];
130    }
131  }
132
133  /**
134   * Extracts contents of the body from a {@code WARCRecord}.
135   * Excludes HTTP headers.
136   *
137   * @param record the {@code WARCRecord}
138   * @return contents of the body
139   * @throws IOException if there is an issue
140   */
141  public static byte[] getBodyContent(final WARCRecord record)
142      throws IOException {
143    ByteArrayOutputStream baos = new ByteArrayOutputStream();
144    String line = HttpParser.readLine(record, WARC_HEADER_ENCODING);
145    if (line == null) {
146      return null;
147    }
148
149    // Just using parseHeaders to move down input stream to body
150    HttpParser.parseHeaders(record, WARC_HEADER_ENCODING);
151    record.dump(baos);
152    return baos.toByteArray();
153  }
154
155  /**
156   * Copies contents to a byte array.
157   *
158   * @param is raw input stream
159   * @param recordLength length of a record
160   * @param enforceLength enforce the length
161   * @return rawContents of body
162   * @throws IOException if there is an issue
163   */
164  private static byte[] copyToByteArray(final InputStream is,
165          final int recordLength, final boolean enforceLength)
166      throws IOException {
167
168    BoundedInputStream bis = new BoundedInputStream(is, recordLength);
169    byte[] rawContents = IOUtils.toByteArray(bis);
170    if (enforceLength && rawContents.length != recordLength) {
171      LOG.error("Read " + rawContents.length + " bytes but expected "
172              + recordLength + " bytes. Continuing...");
173    }
174    return rawContents;
175  }
176}