001/*
002 * Copyright © 2017 The Archives Unleashed Project
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016package io.archivesunleashed.data;
017
018import java.io.BufferedInputStream;
019import java.io.ByteArrayInputStream;
020import java.io.ByteArrayOutputStream;
021import java.io.DataOutputStream;
022import java.io.IOException;
023import java.io.InputStream;
024import java.util.Map;
025import java.util.regex.Matcher;
026import java.util.regex.Pattern;
027import org.apache.commons.httpclient.HttpParser;
028import org.apache.commons.io.IOUtils;
029import org.apache.commons.io.input.BoundedInputStream;
030import org.apache.log4j.Logger;
031import org.archive.format.warc.WARCConstants;
032import org.archive.io.warc.WARCReader;
033import org.archive.io.warc.WARCReaderFactory;
034import org.archive.io.warc.WARCRecord;
035
036/**
037 * Utilities for working with {@code WARCRecord}s (from archive.org APIs).
038 */
039public final class WarcRecordUtils implements WARCConstants {
040
041  /**
042   * Utility classes should not have a public or default constructor.
043   */
044  private WarcRecordUtils() {
045  }
046
047  /**
048  * Setup logger.
049  */
050  private static final Logger LOG = Logger.getLogger(WarcRecordUtils.class);
051
052  /**
053   * Converts raw bytes into an {@code WARCRecord}.
054   *
055   * @param bytes raw bytes
056   * @return parsed {@code WARCRecord}
057   * @throws IOException if there is an issue
058   */
059  public static WARCRecord fromBytes(final byte[] bytes) throws IOException {
060    WARCReader reader = (WARCReader) WARCReaderFactory.get("",
061        new BufferedInputStream(new ByteArrayInputStream(bytes)), false);
062    return (WARCRecord) reader.get();
063  }
064
065  /**
066   * Converts WARC record into raw bytes.
067   *
068   * @param record conents of WARC response record
069   * @return raw contents
070   * @throws IOException if there is an issue
071   */
072  public static byte[] toBytes(final WARCRecord record) throws IOException {
073    ByteArrayOutputStream baos = new ByteArrayOutputStream();
074    DataOutputStream dout = new DataOutputStream(baos);
075
076    dout.write("WARC/0.17\n".getBytes());
077    for (Map.Entry<String, Object> entry : record.getHeader()
078            .getHeaderFields().entrySet()) {
079      dout.write((entry.getKey() + ": " + entry.getValue().toString() + "\n")
080              .getBytes());
081    }
082    dout.write("\n".getBytes());
083    record.dump(dout);
084
085    return baos.toByteArray();
086  }
087
088  /**
089   * Extracts the MIME type of WARC response records.
090   * "WARC-Type" is "response".
091   * Note that this is different from the "Content-Type" in the WARC header.
092   *
093   * @param contents raw contents of the WARC response record
094   * @return MIME type
095   */
096  public static String getWarcResponseMimeType(final byte[] contents) {
097    // This is a somewhat janky way to get the MIME type of the response.
098    // Moreover, this simple regex is not compliant with the specification.
099    // See: https://www.w3.org/Protocols/rfc1341/4_Content-Type.html
100    // It would be much better to parse all headers using an external library:
101    //   org.apache.commons.httpclient.HeaderElement
102    // Note that this is different from the "Content-Type" in the WARC header.
103    Pattern pattern = Pattern.compile("Content-Type: ([^\\s;]+) *(;.*)?",
104            Pattern.CASE_INSENSITIVE);
105    Matcher matcher = pattern.matcher(new String(contents));
106    if (matcher.find()) {
107      return matcher.group(1).replaceAll(";$", "");
108    }
109
110    return null;
111  }
112
113  /**
114   * Extracts raw contents from a {@code WARCRecord} (including HTTP headers).
115   *
116   * @param record the {@code WARCRecord}
117   * @return raw contents
118   * @throws IOException if there is an issue
119   */
120  public static byte[] getContent(final WARCRecord record) throws IOException {
121    int len = (int) record.getHeader().getContentLength();
122
123    // If we have a corrupt record, quit and move on.
124    if (len < 0) {
125      return new byte[0];
126    }
127
128    try {
129      return copyToByteArray(record, len, true);
130    } catch (Exception e) {
131      // Catch exceptions related to any corrupt archive files.
132      return new byte[0];
133    }
134  }
135
136  /**
137   * Extracts contents of the body from a {@code WARCRecord}.
138   * Excludes HTTP headers.
139   *
140   * @param record the {@code WARCRecord}
141   * @return contents of the body
142   * @throws IOException if there is an issue
143   */
144  public static byte[] getBodyContent(final WARCRecord record)
145      throws IOException {
146    ByteArrayOutputStream baos = new ByteArrayOutputStream();
147    String line = HttpParser.readLine(record, WARC_HEADER_ENCODING);
148    if (line == null) {
149      return null;
150    }
151
152    // Just using parseHeaders to move down input stream to body.
153    HttpParser.parseHeaders(record, WARC_HEADER_ENCODING);
154    record.dump(baos);
155    return baos.toByteArray();
156  }
157
158  /**
159   * Copies contents to a byte array.
160   *
161   * @param is raw input stream
162   * @param recordLength length of a record
163   * @param enforceLength enforce the length
164   * @return rawContents of body
165   * @throws IOException if there is an issue
166   */
167  private static byte[] copyToByteArray(final InputStream is,
168          final int recordLength, final boolean enforceLength)
169      throws IOException {
170
171    BoundedInputStream bis = new BoundedInputStream(is, recordLength);
172    byte[] rawContents = IOUtils.toByteArray(bis);
173    if (enforceLength && rawContents.length != recordLength) {
174      LOG.error("Read " + rawContents.length + " bytes but expected "
175              + recordLength + " bytes. Continuing...");
176    }
177    return rawContents;
178  }
179}