001/*
002 * Archives Unleashed Toolkit (AUT):
003 * An open-source platform for analyzing web archives.
004 *
005 * Licensed under the Apache License, Version 2.0 (the "License");
006 * you may not use this file except in compliance with the License.
007 * You may obtain a copy of the License at
008 *
009 *     http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package io.archivesunleashed.data;
018
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.HttpParser;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.BoundedInputStream;
import org.apache.log4j.Logger;
import org.archive.format.warc.WARCConstants;
import org.archive.io.warc.WARCReader;
import org.archive.io.warc.WARCReaderFactory;
import org.archive.io.warc.WARCRecord;
036
037/**
038 * Utilities for working with {@code WARCRecord}s (from archive.org APIs).
039 */
040public final class WarcRecordUtils implements WARCConstants {
041
042  /**
043   * Utility classes should not have a public or default constructor.
044   */
045  private WarcRecordUtils() {
046  }
047
048  /**
049  * Setup logger.
050  */
051  private static final Logger LOG = Logger.getLogger(WarcRecordUtils.class);
052
053  /**
054   * Converts raw bytes into an {@code WARCRecord}.
055   *
056   * @param bytes raw bytes
057   * @return parsed {@code WARCRecord}
058   * @throws IOException if there is an issue
059   */
060  public static WARCRecord fromBytes(final byte[] bytes) throws IOException {
061    WARCReader reader = (WARCReader) WARCReaderFactory.get("",
062        new BufferedInputStream(new ByteArrayInputStream(bytes)), false);
063    return (WARCRecord) reader.get();
064  }
065
066  /**
067   * Converts WARC record into raw bytes.
068   *
069   * @param record conents of WARC response record
070   * @return raw contents
071   * @throws IOException if there is an issue
072   */
073  public static byte[] toBytes(final WARCRecord record) throws IOException {
074    ByteArrayOutputStream baos = new ByteArrayOutputStream();
075    DataOutputStream dout = new DataOutputStream(baos);
076
077    dout.write("WARC/0.17\n".getBytes());
078    for (Map.Entry<String, Object> entry : record.getHeader()
079            .getHeaderFields().entrySet()) {
080      dout.write((entry.getKey() + ": " + entry.getValue().toString() + "\n")
081              .getBytes());
082    }
083    dout.write("\n".getBytes());
084    record.dump(dout);
085
086    return baos.toByteArray();
087  }
088
089  /**
090   * Extracts the MIME type of WARC response records.
091   * "WARC-Type" is "response".
092   * Note that this is different from the "Content-Type" in the WARC header.
093   *
094   * @param contents raw contents of the WARC response record
095   * @return MIME type
096   */
097  public static String getWarcResponseMimeType(final byte[] contents) {
098    // This is a somewhat janky way to get the MIME type of the response.
099    // Moreover, this simple regex is not compliant with the specification.
100    // See: https://www.w3.org/Protocols/rfc1341/4_Content-Type.html
101    // It would be much better to parse all headers using an external library:
102    //   org.apache.commons.httpclient.HeaderElement
103    // Note that this is different from the "Content-Type" in the WARC header.
104    Pattern pattern = Pattern.compile("Content-Type: ([^\\s;]+) *(;.*)?",
105            Pattern.CASE_INSENSITIVE);
106    Matcher matcher = pattern.matcher(new String(contents));
107    if (matcher.find()) {
108      return matcher.group(1).replaceAll(";$", "");
109    }
110
111    return null;
112  }
113
114  /**
115   * Extracts raw contents from a {@code WARCRecord} (including HTTP headers).
116   *
117   * @param record the {@code WARCRecord}
118   * @return raw contents
119   * @throws IOException if there is an issue
120   */
121  public static byte[] getContent(final WARCRecord record) throws IOException {
122    int len = (int) record.getHeader().getContentLength();
123
124    // If we have a corrupt record, quit and move on.
125    if (len < 0) {
126      return new byte[0];
127    }
128
129    try {
130      return copyToByteArray(record, len, true);
131    } catch (Exception e) {
132      // Catch exceptions related to any corrupt archive files.
133      return new byte[0];
134    }
135  }
136
137  /**
138   * Extracts contents of the body from a {@code WARCRecord}.
139   * Excludes HTTP headers.
140   *
141   * @param record the {@code WARCRecord}
142   * @return contents of the body
143   * @throws IOException if there is an issue
144   */
145  public static byte[] getBodyContent(final WARCRecord record)
146      throws IOException {
147    ByteArrayOutputStream baos = new ByteArrayOutputStream();
148    String line = HttpParser.readLine(record, WARC_HEADER_ENCODING);
149    if (line == null) {
150      return null;
151    }
152
153    // Just using parseHeaders to move down input stream to body
154    HttpParser.parseHeaders(record, WARC_HEADER_ENCODING);
155    record.dump(baos);
156    return baos.toByteArray();
157  }
158
159  /**
160   * Copies contents to a byte array.
161   *
162   * @param is raw input stream
163   * @param recordLength length of a record
164   * @param enforceLength enforce the length
165   * @return rawContents of body
166   * @throws IOException if there is an issue
167   */
168  private static byte[] copyToByteArray(final InputStream is,
169          final int recordLength, final boolean enforceLength)
170      throws IOException {
171
172    BoundedInputStream bis = new BoundedInputStream(is, recordLength);
173    byte[] rawContents = IOUtils.toByteArray(bis);
174    if (enforceLength && rawContents.length != recordLength) {
175      LOG.error("Read " + rawContents.length + " bytes but expected "
176              + recordLength + " bytes. Continuing...");
177    }
178    return rawContents;
179  }
180}