001/*
002 * Archives Unleashed Toolkit (AUT):
003 * An open-source platform for analyzing web archives.
004 *
005 * Licensed under the Apache License, Version 2.0 (the "License");
006 * you may not use this file except in compliance with the License.
007 * You may obtain a copy of the License at
008 *
009 *     http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package io.archivesunleashed.data;
018
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.BoundedInputStream;
import org.apache.log4j.Logger;
import org.archive.io.arc.ARCReader;
import org.archive.io.arc.ARCReaderFactory;
import org.archive.io.arc.ARCRecord;
import org.archive.io.arc.ARCRecordMetaData;
032
033/**
034 * Utilities for working with {@code ARCRecord}s (from archive.org APIs).
035 */
036public final class ArcRecordUtils {
037
038  /**
039   * Utility classes should not have a public or default constructor.
040   */
041  private ArcRecordUtils() {
042  }
043
044  /**
045   * Setup logger.
046   */
047  private static final Logger LOG = Logger.getLogger(ArcRecordUtils.class);
048
049  /**
050   * Converts raw bytes into an {@code ARCRecord}.
051   *
052   * @param bytes raw bytes
053   * @return parsed {@code ARCRecord}
054   * @throws IOException if there is an issue
055   */
056  public static ARCRecord fromBytes(final byte[] bytes) throws IOException {
057    ARCReader reader = (ARCReader) ARCReaderFactory.get("",
058        new BufferedInputStream(new ByteArrayInputStream(bytes)), false);
059    return (ARCRecord) reader.get();
060  }
061
062  /**
063   * Converts ARC record into raw bytes.
064   *
065   * @param record conents of WARC response record
066   * @return raw contents
067   * @throws IOException if there is an issue
068   */
069  public static byte[] toBytes(final ARCRecord record) throws IOException {
070    ARCRecordMetaData meta = record.getMetaData();
071
072    String metaline = meta.getUrl() + " " + meta.getIp()
073            + " " + meta.getDate() + " " + meta.getMimetype()
074            + " " + (int) meta.getLength();
075    String versionEtc = "";
076
077
078    if (meta.getOffset() == 0) {
079      versionEtc = "\n" + meta.getVersion().replace(".", " ")
080              + " " + meta.getOrigin() + "\n"
081              + "URL IP-address Archive-date Content-type Archive-length";
082      metaline += versionEtc;
083    }
084
085    ByteArrayOutputStream baos = new ByteArrayOutputStream();
086    DataOutputStream dout = new DataOutputStream(baos);
087    dout.write(metaline.getBytes());
088    dout.write("\n".getBytes());
089
090    long recordLength = meta.getLength() - versionEtc.length();
091    long len = IOUtils.copyLarge(new BoundedInputStream(record, recordLength),
092            dout);
093    if (len != recordLength) {
094      LOG.error("Read " + len + " bytes but expected " + recordLength
095              + " bytes. Continuing...");
096    }
097    return baos.toByteArray();
098  }
099
100  /**
101   * Extracts raw contents from an {@code ARCRecord} (including HTTP headers).
102   *
103   * @param record the {@code ARCRecord}
104   * @return raw contents
105   * @throws IOException if there is an issue
106   */
107  public static byte[] getContent(final ARCRecord record) throws IOException {
108    ARCRecordMetaData meta = record.getMetaData();
109    String versionEtc = "";
110
111    if (meta.getOffset() == 0) {
112      versionEtc = "\n" + meta.getVersion().replace(".", " ")
113              + " " + meta.getOrigin() + "\n"
114              + "URL IP-address Archive-date Content-type Archive-length";
115    }
116
117    return copyToByteArray(record, (int) meta.getLength()
118            - versionEtc.length(), true);
119  }
120
121  /**
122   * Extracts contents of the body from an {@code ARCRecord}.
123   * Excludes HTTP headers.
124   *
125   * @param record the {@code ARCRecord}
126   * @return contents of the body
127   * @throws IOException if there is an issue
128   */
129  public static byte[] getBodyContent(final ARCRecord record)
130      throws IOException {
131    byte[] raw = getContent(record);
132    int bodyOffset = record.getBodyOffset();
133
134    byte[] content = null;
135    try {
136      content = new byte[raw.length - bodyOffset];
137      System.arraycopy(raw, bodyOffset, content, 0, content.length);
138    } catch (java.lang.NegativeArraySizeException e) {
139      // To find out what URL causing the error: record.getMetaData().getUrl()
140      // For some records, we're missing the actual content data, likely due
141      // to a crawler gitch. Nothing much we can do, just swallow and move on.
142      content = new byte[0];
143    }
144    return content;
145  }
146
147  /**
148   * Copies contents to a byte array.
149   *
150   * @param is raw input stream
151   * @param recordLength is length of a record
152   * @param enforceLength enforce the length
153   * @return rawContents of body
154   * @throws IOException if there is an issue
155   */
156  private static byte[] copyToByteArray(final InputStream is,
157          final int recordLength, final boolean enforceLength)
158      throws IOException {
159
160    BoundedInputStream bis = new BoundedInputStream(is, recordLength);
161    byte[] rawContents = IOUtils.toByteArray(bis);
162    if (enforceLength && rawContents.length != recordLength) {
163      LOG.error("Read " + rawContents.length + " bytes but expected "
164              + recordLength + " bytes. Continuing...");
165    }
166    return rawContents;
167  }
168}