001/*
002 * Copyright © 2017 The Archives Unleashed Project
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016package io.archivesunleashed.data;
017
018import java.io.BufferedInputStream;
019import java.io.ByteArrayInputStream;
020import java.io.ByteArrayOutputStream;
021import java.io.DataOutputStream;
022import java.io.IOException;
023import java.io.InputStream;
024import org.apache.commons.io.IOUtils;
025import org.apache.commons.io.input.BoundedInputStream;
026import org.apache.log4j.Logger;
027import org.archive.io.arc.ARCReader;
028import org.archive.io.arc.ARCReaderFactory;
029import org.archive.io.arc.ARCRecord;
030import org.archive.io.arc.ARCRecordMetaData;
031
032/** Utilities for working with {@code ARCRecord}s (from archive.org APIs). */
033public final class ArcRecordUtils {
034
035  /** Utility classes should not have a public or default constructor. */
036  private ArcRecordUtils() {}
037
038  /** Setup logger. */
039  private static final Logger LOG = Logger.getLogger(ArcRecordUtils.class);
040
041  /**
042   * Converts raw bytes into an {@code ARCRecord}.
043   *
044   * @param bytes raw bytes
045   * @return parsed {@code ARCRecord}
046   * @throws IOException if there is an issue
047   */
048  public static ARCRecord fromBytes(final byte[] bytes) throws IOException {
049    ARCReader reader =
050        (ARCReader)
051            ARCReaderFactory.get(
052                "", new BufferedInputStream(new ByteArrayInputStream(bytes)), false);
053    return (ARCRecord) reader.get();
054  }
055
056  /**
057   * Converts ARC record into raw bytes.
058   *
059   * @param record conents of WARC response record
060   * @return raw contents
061   * @throws IOException if there is an issue
062   */
063  public static byte[] toBytes(final ARCRecord record) throws IOException {
064    ARCRecordMetaData meta = record.getMetaData();
065
066    String metaline =
067        meta.getUrl()
068            + " "
069            + meta.getIp()
070            + " "
071            + meta.getDate()
072            + " "
073            + meta.getMimetype()
074            + " "
075            + (int) meta.getLength();
076    String versionEtc = "";
077
078    if (meta.getOffset() == 0) {
079      versionEtc =
080          "\n"
081              + meta.getVersion().replace(".", " ")
082              + " "
083              + meta.getOrigin()
084              + "\n"
085              + "URL IP-address Archive-date Content-type Archive-length";
086      metaline += versionEtc;
087    }
088
089    ByteArrayOutputStream baos = new ByteArrayOutputStream();
090    DataOutputStream dout = new DataOutputStream(baos);
091    dout.write(metaline.getBytes());
092    dout.write("\n".getBytes());
093
094    long recordLength = meta.getLength() - versionEtc.length();
095    long len = IOUtils.copyLarge(new BoundedInputStream(record, recordLength), dout);
096    if (len != recordLength) {
097      LOG.error("Read " + len + " bytes but expected " + recordLength + " bytes. Continuing...");
098    }
099    return baos.toByteArray();
100  }
101
102  /**
103   * Extracts raw contents from an {@code ARCRecord} (including HTTP headers).
104   *
105   * @param record the {@code ARCRecord}
106   * @return raw contents
107   * @throws IOException if there is an issue
108   */
109  public static byte[] getContent(final ARCRecord record) throws IOException {
110    ARCRecordMetaData meta = record.getMetaData();
111    String versionEtc = "";
112
113    if (meta.getOffset() == 0) {
114      versionEtc =
115          "\n"
116              + meta.getVersion().replace(".", " ")
117              + " "
118              + meta.getOrigin()
119              + "\n"
120              + "URL IP-address Archive-date Content-type Archive-length";
121    }
122
123    try {
124      return copyToByteArray(record, (int) meta.getLength() - versionEtc.length(), true);
125    } catch (Exception e) {
126      // Catch exceptions related to any corrupt archive files.
127      return new byte[0];
128    }
129  }
130
131  /**
132   * Extracts contents of the body from an {@code ARCRecord}. Excludes HTTP headers.
133   *
134   * @param record the {@code ARCRecord}
135   * @return contents of the body
136   * @throws IOException if there is an issue
137   */
138  public static byte[] getBodyContent(final ARCRecord record) throws IOException {
139    byte[] raw = getContent(record);
140    int bodyOffset = record.getBodyOffset();
141
142    byte[] content = null;
143    try {
144      content = new byte[raw.length - bodyOffset];
145      System.arraycopy(raw, bodyOffset, content, 0, content.length);
146    } catch (java.lang.NegativeArraySizeException e) {
147      // To find out what URL causing the error: record.getMetaData().getUrl()
148      // For some records, we're missing the actual content data, likely due
149      // to a crawler gitch. Nothing much we can do, just swallow and move on.
150      content = new byte[0];
151    }
152    return content;
153  }
154
155  /**
156   * Copies contents to a byte array.
157   *
158   * @param is raw input stream
159   * @param recordLength is length of a record
160   * @param enforceLength enforce the length
161   * @return rawContents of body
162   * @throws IOException if there is an issue
163   */
164  private static byte[] copyToByteArray(
165      final InputStream is, final int recordLength, final boolean enforceLength)
166      throws IOException {
167
168    BoundedInputStream bis = new BoundedInputStream(is, recordLength);
169    byte[] rawContents = IOUtils.toByteArray(bis);
170    if (enforceLength && rawContents.length != recordLength) {
171      LOG.error(
172          "Read "
173              + rawContents.length
174              + " bytes but expected "
175              + recordLength
176              + " bytes. Continuing...");
177    }
178    return rawContents;
179  }
180}