001/*
002 * Copyright © 2017 The Archives Unleashed Project
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016package io.archivesunleashed.data;
017
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.HttpParser;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.BoundedInputStream;
import org.apache.log4j.Logger;
import org.archive.format.warc.WARCConstants;
import org.archive.io.warc.WARCReader;
import org.archive.io.warc.WARCReaderFactory;
import org.archive.io.warc.WARCRecord;
035
036/** Utilities for working with {@code WARCRecord}s (from archive.org APIs). */
037public final class WarcRecordUtils implements WARCConstants {
038
039  /** Utility classes should not have a public or default constructor. */
040  private WarcRecordUtils() {}
041
042  /** Setup logger. */
043  private static final Logger LOG = Logger.getLogger(WarcRecordUtils.class);
044
045  /**
046   * Converts raw bytes into an {@code WARCRecord}.
047   *
048   * @param bytes raw bytes
049   * @return parsed {@code WARCRecord}
050   * @throws IOException if there is an issue
051   */
052  public static WARCRecord fromBytes(final byte[] bytes) throws IOException {
053    WARCReader reader =
054        (WARCReader)
055            WARCReaderFactory.get(
056                "", new BufferedInputStream(new ByteArrayInputStream(bytes)), false);
057    return (WARCRecord) reader.get();
058  }
059
060  /**
061   * Converts WARC record into raw bytes.
062   *
063   * @param record conents of WARC response record
064   * @return raw contents
065   * @throws IOException if there is an issue
066   */
067  public static byte[] toBytes(final WARCRecord record) throws IOException {
068    ByteArrayOutputStream baos = new ByteArrayOutputStream();
069    DataOutputStream dout = new DataOutputStream(baos);
070
071    dout.write("WARC/0.17\n".getBytes());
072    for (Map.Entry<String, Object> entry : record.getHeader().getHeaderFields().entrySet()) {
073      dout.write((entry.getKey() + ": " + entry.getValue().toString() + "\n").getBytes());
074    }
075    dout.write("\n".getBytes());
076    record.dump(dout);
077
078    return baos.toByteArray();
079  }
080
081  /**
082   * Extracts the MIME type of WARC response records. "WARC-Type" is "response". Note that this is
083   * different from the "Content-Type" in the WARC header.
084   *
085   * @param contents raw contents of the WARC response record
086   * @return MIME type
087   */
088  public static String getWarcResponseMimeType(final byte[] contents) {
089    // This is a somewhat janky way to get the MIME type of the response.
090    // Moreover, this simple regex is not compliant with the specification.
091    // See: https://www.w3.org/Protocols/rfc1341/4_Content-Type.html
092    // It would be much better to parse all headers using an external library:
093    //   org.apache.commons.httpclient.HeaderElement
094    // Note that this is different from the "Content-Type" in the WARC header.
095    Pattern pattern = Pattern.compile("Content-Type: ([^\\s;]+) *(;.*)?", Pattern.CASE_INSENSITIVE);
096    Matcher matcher = pattern.matcher(new String(contents));
097    if (matcher.find()) {
098      return matcher.group(1).replaceAll(";$", "");
099    }
100
101    return null;
102  }
103
104  /**
105   * Extracts raw contents from a {@code WARCRecord} (including HTTP headers).
106   *
107   * @param record the {@code WARCRecord}
108   * @return raw contents
109   * @throws IOException if there is an issue
110   */
111  public static byte[] getContent(final WARCRecord record) throws IOException {
112    int len = (int) record.getHeader().getContentLength();
113
114    // If we have a corrupt record, quit and move on.
115    if (len < 0) {
116      return new byte[0];
117    }
118
119    try {
120      return copyToByteArray(record, len, true);
121    } catch (Exception e) {
122      // Catch exceptions related to any corrupt archive files.
123      return new byte[0];
124    }
125  }
126
127  /**
128   * Extracts contents of the body from a {@code WARCRecord}. Excludes HTTP headers.
129   *
130   * @param record the {@code WARCRecord}
131   * @return contents of the body
132   * @throws IOException if there is an issue
133   */
134  public static byte[] getBodyContent(final WARCRecord record) throws IOException {
135    ByteArrayOutputStream baos = new ByteArrayOutputStream();
136    String line = HttpParser.readLine(record, WARC_HEADER_ENCODING);
137    if (line == null) {
138      return null;
139    }
140
141    // Just using parseHeaders to move down input stream to body.
142    HttpParser.parseHeaders(record, WARC_HEADER_ENCODING);
143    record.dump(baos);
144    return baos.toByteArray();
145  }
146
147  /**
148   * Copies contents to a byte array.
149   *
150   * @param is raw input stream
151   * @param recordLength length of a record
152   * @param enforceLength enforce the length
153   * @return rawContents of body
154   * @throws IOException if there is an issue
155   */
156  private static byte[] copyToByteArray(
157      final InputStream is, final int recordLength, final boolean enforceLength)
158      throws IOException {
159
160    BoundedInputStream bis = new BoundedInputStream(is, recordLength);
161    byte[] rawContents = IOUtils.toByteArray(bis);
162    if (enforceLength && rawContents.length != recordLength) {
163      LOG.error(
164          "Read "
165              + rawContents.length
166              + " bytes but expected "
167              + recordLength
168              + " bytes. Continuing...");
169    }
170    return rawContents;
171  }
172}