001/*
002 * Archives Unleashed Toolkit (AUT):
003 * An open-source platform for analyzing web archives.
004 *
005 * Licensed under the Apache License, Version 2.0 (the "License");
006 * you may not use this file except in compliance with the License.
007 * You may obtain a copy of the License at
008 *
009 *     http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package io.archivesunleashed.data;
018
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.BoundedInputStream;
import org.apache.log4j.Logger;
import org.archive.io.arc.ARCReader;
import org.archive.io.arc.ARCReaderFactory;
import org.archive.io.arc.ARCRecord;
import org.archive.io.arc.ARCRecordMetaData;
032
033/**
034 * Utilities for working with {@code ARCRecord}s (from archive.org APIs).
035 */
036public final class ArcRecordUtils {
037
038  /**
039   * Utility classes should not have a public or default constructor.
040   */
041  private ArcRecordUtils() {
042  }
043
044  /**
045   * Setup logger.
046   */
047  private static final Logger LOG = Logger.getLogger(ArcRecordUtils.class);
048
049  /**
050   * Converts raw bytes into an {@code ARCRecord}.
051   *
052   * @param bytes raw bytes
053   * @return parsed {@code ARCRecord}
054   * @throws IOException if there is an issue
055   */
056  public static ARCRecord fromBytes(final byte[] bytes) throws IOException {
057    ARCReader reader = (ARCReader) ARCReaderFactory.get("",
058        new BufferedInputStream(new ByteArrayInputStream(bytes)), false);
059    return (ARCRecord) reader.get();
060  }
061
062  /**
063   * Converts ARC record into raw bytes.
064   *
065   * @param record conents of WARC response record
066   * @return raw contents
067   * @throws IOException if there is an issue
068   */
069  public static byte[] toBytes(final ARCRecord record) throws IOException {
070    ARCRecordMetaData meta = record.getMetaData();
071
072    String metaline = meta.getUrl() + " " + meta.getIp()
073            + " " + meta.getDate() + " " + meta.getMimetype()
074            + " " + (int) meta.getLength();
075    String versionEtc = "";
076
077
078    if (meta.getOffset() == 0) {
079      versionEtc = "\n" + meta.getVersion().replace(".", " ")
080              + " " + meta.getOrigin() + "\n"
081              + "URL IP-address Archive-date Content-type Archive-length";
082      metaline += versionEtc;
083    }
084
085    ByteArrayOutputStream baos = new ByteArrayOutputStream();
086    DataOutputStream dout = new DataOutputStream(baos);
087    dout.write(metaline.getBytes());
088    dout.write("\n".getBytes());
089
090    long recordLength = meta.getLength() - versionEtc.length();
091    long len = IOUtils.copyLarge(new BoundedInputStream(record, recordLength),
092            dout);
093    if (len != recordLength) {
094      LOG.error("Read " + len + " bytes but expected " + recordLength
095              + " bytes. Continuing...");
096    }
097    return baos.toByteArray();
098  }
099
100  /**
101   * Extracts raw contents from an {@code ARCRecord} (including HTTP headers).
102   *
103   * @param record the {@code ARCRecord}
104   * @return raw contents
105   * @throws IOException if there is an issue
106   */
107  public static byte[] getContent(final ARCRecord record) throws IOException {
108    ARCRecordMetaData meta = record.getMetaData();
109    String versionEtc = "";
110
111    if (meta.getOffset() == 0) {
112      versionEtc = "\n" + meta.getVersion().replace(".", " ")
113              + " " + meta.getOrigin() + "\n"
114              + "URL IP-address Archive-date Content-type Archive-length";
115    }
116
117    try {
118      return copyToByteArray(record, (int) meta.getLength()
119              - versionEtc.length(), true);
120    } catch (Exception e) {
121      // Catch exceptions related to any corrupt archive files.
122      return new byte[0];
123    }
124  }
125
126  /**
127   * Extracts contents of the body from an {@code ARCRecord}.
128   * Excludes HTTP headers.
129   *
130   * @param record the {@code ARCRecord}
131   * @return contents of the body
132   * @throws IOException if there is an issue
133   */
134  public static byte[] getBodyContent(final ARCRecord record)
135      throws IOException {
136    byte[] raw = getContent(record);
137    int bodyOffset = record.getBodyOffset();
138
139    byte[] content = null;
140    try {
141      content = new byte[raw.length - bodyOffset];
142      System.arraycopy(raw, bodyOffset, content, 0, content.length);
143    } catch (java.lang.NegativeArraySizeException e) {
144      // To find out what URL causing the error: record.getMetaData().getUrl()
145      // For some records, we're missing the actual content data, likely due
146      // to a crawler gitch. Nothing much we can do, just swallow and move on.
147      content = new byte[0];
148    }
149    return content;
150  }
151
152  /**
153   * Copies contents to a byte array.
154   *
155   * @param is raw input stream
156   * @param recordLength is length of a record
157   * @param enforceLength enforce the length
158   * @return rawContents of body
159   * @throws IOException if there is an issue
160   */
161  private static byte[] copyToByteArray(final InputStream is,
162          final int recordLength, final boolean enforceLength)
163      throws IOException {
164
165    BoundedInputStream bis = new BoundedInputStream(is, recordLength);
166    byte[] rawContents = IOUtils.toByteArray(bis);
167    if (enforceLength && rawContents.length != recordLength) {
168      LOG.error("Read " + rawContents.length + " bytes but expected "
169              + recordLength + " bytes. Continuing...");
170    }
171    return rawContents;
172  }
173}