001/*
002 * Archives Unleashed Toolkit (AUT):
003 * An open-source platform for analyzing web archives.
004 *
005 * Licensed under the Apache License, Version 2.0 (the "License");
006 * you may not use this file except in compliance with the License.
007 * You may obtain a copy of the License at
008 *
009 *     http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package io.archivesunleashed.data;
018
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.BoundedInputStream;
import org.apache.log4j.Logger;
import org.archive.io.arc.ARCReader;
import org.archive.io.arc.ARCReaderFactory;
import org.archive.io.arc.ARCRecord;
import org.archive.io.arc.ARCRecordMetaData;
032
033/**
034 * Utilities for working with {@code ARCRecord}s (from archive.org APIs).
035 */
036public final class ArcRecordUtils {
037
038  /**
039   * Utility classes should not have a public or default constructor.
040   */
041  private ArcRecordUtils() {
042  }
043
044  /**
045   * Setup logger.
046   */
047  private static final Logger LOG = Logger.getLogger(ArcRecordUtils.class);
048
049  /**
050   * Converts raw bytes into an {@code ARCRecord}.
051   *
052   * @param bytes raw bytes
053   * @return parsed {@code ARCRecord}
054   * @throws IOException if there is an issue
055   */
056  public static ARCRecord fromBytes(final byte[] bytes) throws IOException {
057    ARCReader reader = (ARCReader) ARCReaderFactory.get("",
058        new BufferedInputStream(new ByteArrayInputStream(bytes)), false);
059    return (ARCRecord) reader.get();
060  }
061
062  /**
063   * Converts ARC record into raw bytes.
064   *
065   * @param record conents of WARC response record
066   * @return raw contents
067   * @throws IOException if there is an issue
068   */
069  public static byte[] toBytes(final ARCRecord record) throws IOException {
070    ARCRecordMetaData meta = record.getMetaData();
071
072    String metaline = meta.getUrl() + " " + meta.getIp()
073        + " " + meta.getDate() + " " + meta.getMimetype()
074        + " " + (int) meta.getLength();
075
076    ByteArrayOutputStream baos = new ByteArrayOutputStream();
077    DataOutputStream dout = new DataOutputStream(baos);
078    dout.write(metaline.getBytes());
079    dout.write("\n".getBytes());
080
081    long recordLength = meta.getLength();
082    long len = IOUtils.copyLarge(new BoundedInputStream(record, recordLength),
083            dout);
084    if (len != recordLength) {
085      LOG.error("Read " + len + " bytes but expected " + recordLength
086              + " bytes. Continuing...");
087    }
088    return baos.toByteArray();
089  }
090
091  /**
092   * Extracts raw contents from an {@code ARCRecord} (including HTTP headers).
093   *
094   * @param record the {@code ARCRecord}
095   * @return raw contents
096   * @throws IOException if there is an issue
097   */
098  public static byte[] getContent(final ARCRecord record) throws IOException {
099    ARCRecordMetaData meta = record.getMetaData();
100
101    return copyToByteArray(record, (int) meta.getLength(), true);
102  }
103
104  /**
105   * Extracts contents of the body from an {@code ARCRecord}.
106   * Excludes HTTP headers.
107   *
108   * @param record the {@code ARCRecord}
109   * @return contents of the body
110   * @throws IOException if there is an issue
111   */
112  public static byte[] getBodyContent(final ARCRecord record)
113      throws IOException {
114    byte[] raw = getContent(record);
115    int bodyOffset = record.getBodyOffset();
116
117    byte[] content = null;
118    try {
119      content = new byte[raw.length - bodyOffset];
120      System.arraycopy(raw, bodyOffset, content, 0, content.length);
121    } catch (java.lang.NegativeArraySizeException e) {
122      // To find out what URL causing the error: record.getMetaData().getUrl()
123      // For some records, we're missing the actual content data, likely due
124      // to a crawler gitch. Nothing much we can do, just swallow and move on.
125      content = new byte[0];
126    }
127    return content;
128  }
129
130  /**
131   * Copies contents to a byte array.
132   *
133   * @param is raw input stream
134   * @param recordLength is length of a record
135   * @param enforceLength enforce the length
136   * @return rawContents of body
137   * @throws IOException if there is an issue
138   */
139  private static byte[] copyToByteArray(final InputStream is,
140          final int recordLength, final boolean enforceLength)
141      throws IOException {
142
143    BoundedInputStream bis = new BoundedInputStream(is, recordLength);
144    byte[] rawContents = IOUtils.toByteArray(bis);
145    if (enforceLength && rawContents.length != recordLength) {
146      LOG.error("Read " + rawContents.length + " bytes but expected "
147              + recordLength + " bytes. Continuing...");
148    }
149    return rawContents;
150  }
151}