001/*
002 * Copyright © 2017 The Archives Unleashed Project
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016package io.archivesunleashed.data;
017
018import java.io.BufferedInputStream;
019import java.io.ByteArrayInputStream;
020import java.io.ByteArrayOutputStream;
021import java.io.DataOutputStream;
022import java.io.IOException;
023import java.io.InputStream;
024import org.apache.commons.io.IOUtils;
025import org.apache.commons.io.input.BoundedInputStream;
026import org.apache.log4j.Logger;
027import org.archive.io.arc.ARCReader;
028import org.archive.io.arc.ARCReaderFactory;
029import org.archive.io.arc.ARCRecord;
030import org.archive.io.arc.ARCRecordMetaData;
031
032/**
033 * Utilities for working with {@code ARCRecord}s (from archive.org APIs).
034 */
035public final class ArcRecordUtils {
036
037  /**
038   * Utility classes should not have a public or default constructor.
039   */
040  private ArcRecordUtils() {
041  }
042
043  /**
044   * Setup logger.
045   */
046  private static final Logger LOG = Logger.getLogger(ArcRecordUtils.class);
047
048  /**
049   * Converts raw bytes into an {@code ARCRecord}.
050   *
051   * @param bytes raw bytes
052   * @return parsed {@code ARCRecord}
053   * @throws IOException if there is an issue
054   */
055  public static ARCRecord fromBytes(final byte[] bytes) throws IOException {
056    ARCReader reader = (ARCReader) ARCReaderFactory.get("",
057        new BufferedInputStream(new ByteArrayInputStream(bytes)), false);
058    return (ARCRecord) reader.get();
059  }
060
061  /**
062   * Converts ARC record into raw bytes.
063   *
064   * @param record conents of WARC response record
065   * @return raw contents
066   * @throws IOException if there is an issue
067   */
068  public static byte[] toBytes(final ARCRecord record) throws IOException {
069    ARCRecordMetaData meta = record.getMetaData();
070
071    String metaline = meta.getUrl() + " " + meta.getIp()
072            + " " + meta.getDate() + " " + meta.getMimetype()
073            + " " + (int) meta.getLength();
074    String versionEtc = "";
075
076
077    if (meta.getOffset() == 0) {
078      versionEtc = "\n" + meta.getVersion().replace(".", " ")
079              + " " + meta.getOrigin() + "\n"
080              + "URL IP-address Archive-date Content-type Archive-length";
081      metaline += versionEtc;
082    }
083
084    ByteArrayOutputStream baos = new ByteArrayOutputStream();
085    DataOutputStream dout = new DataOutputStream(baos);
086    dout.write(metaline.getBytes());
087    dout.write("\n".getBytes());
088
089    long recordLength = meta.getLength() - versionEtc.length();
090    long len = IOUtils.copyLarge(new BoundedInputStream(record, recordLength),
091            dout);
092    if (len != recordLength) {
093      LOG.error("Read " + len + " bytes but expected " + recordLength
094              + " bytes. Continuing...");
095    }
096    return baos.toByteArray();
097  }
098
099  /**
100   * Extracts raw contents from an {@code ARCRecord} (including HTTP headers).
101   *
102   * @param record the {@code ARCRecord}
103   * @return raw contents
104   * @throws IOException if there is an issue
105   */
106  public static byte[] getContent(final ARCRecord record) throws IOException {
107    ARCRecordMetaData meta = record.getMetaData();
108    String versionEtc = "";
109
110    if (meta.getOffset() == 0) {
111      versionEtc = "\n" + meta.getVersion().replace(".", " ")
112              + " " + meta.getOrigin() + "\n"
113              + "URL IP-address Archive-date Content-type Archive-length";
114    }
115
116    try {
117      return copyToByteArray(record, (int) meta.getLength()
118              - versionEtc.length(), true);
119    } catch (Exception e) {
120      // Catch exceptions related to any corrupt archive files.
121      return new byte[0];
122    }
123  }
124
125  /**
126   * Extracts contents of the body from an {@code ARCRecord}.
127   * Excludes HTTP headers.
128   *
129   * @param record the {@code ARCRecord}
130   * @return contents of the body
131   * @throws IOException if there is an issue
132   */
133  public static byte[] getBodyContent(final ARCRecord record)
134      throws IOException {
135    byte[] raw = getContent(record);
136    int bodyOffset = record.getBodyOffset();
137
138    byte[] content = null;
139    try {
140      content = new byte[raw.length - bodyOffset];
141      System.arraycopy(raw, bodyOffset, content, 0, content.length);
142    } catch (java.lang.NegativeArraySizeException e) {
143      // To find out what URL causing the error: record.getMetaData().getUrl()
144      // For some records, we're missing the actual content data, likely due
145      // to a crawler gitch. Nothing much we can do, just swallow and move on.
146      content = new byte[0];
147    }
148    return content;
149  }
150
151  /**
152   * Copies contents to a byte array.
153   *
154   * @param is raw input stream
155   * @param recordLength is length of a record
156   * @param enforceLength enforce the length
157   * @return rawContents of body
158   * @throws IOException if there is an issue
159   */
160  private static byte[] copyToByteArray(final InputStream is,
161          final int recordLength, final boolean enforceLength)
162      throws IOException {
163
164    BoundedInputStream bis = new BoundedInputStream(is, recordLength);
165    byte[] rawContents = IOUtils.toByteArray(bis);
166    if (enforceLength && rawContents.length != recordLength) {
167      LOG.error("Read " + rawContents.length + " bytes but expected "
168              + recordLength + " bytes. Continuing...");
169    }
170    return rawContents;
171  }
172}