/*
 * Copyright © 2017 The Archives Unleashed Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.archivesunleashed.data;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.BoundedInputStream;
import org.apache.log4j.Logger;
import org.archive.io.arc.ARCReader;
import org.archive.io.arc.ARCReaderFactory;
import org.archive.io.arc.ARCRecord;
import org.archive.io.arc.ARCRecordMetaData;

/**
 * Utilities for working with {@code ARCRecord}s (from archive.org APIs).
 */
public final class ArcRecordUtils {

  /**
   * Setup logger.
   */
  private static final Logger LOG = Logger.getLogger(ArcRecordUtils.class);

  /**
   * Field-name line that closes the version block written for a record at
   * offset 0.
   */
  private static final String FIELD_NAMES_LINE =
      "URL IP-address Archive-date Content-type Archive-length";

  /**
   * Utility classes should not have a public or default constructor.
   */
  private ArcRecordUtils() {
  }

  /**
   * Converts raw bytes into an {@code ARCRecord}.
   *
   * @param bytes raw bytes
   * @return parsed {@code ARCRecord}
   * @throws IOException if there is an issue
   */
  public static ARCRecord fromBytes(final byte[] bytes) throws IOException {
    // NOTE(review): the reader is never closed here; presumably the returned
    // record keeps reading from the reader's stream -- confirm before
    // introducing try-with-resources.
    InputStream in = new BufferedInputStream(new ByteArrayInputStream(bytes));
    ARCReader reader = (ARCReader) ARCReaderFactory.get("", in, false);
    return (ARCRecord) reader.get();
  }

  /**
   * Converts ARC record into raw bytes.
   *
   * <p>The output is the header line rebuilt from the record's metadata
   * (URL, IP, date, MIME type, length), followed by the version block when the
   * record sits at offset 0, a newline, and then the bytes read from the
   * record itself.
   *
   * @param record the {@code ARCRecord} to serialize
   * @return raw contents
   * @throws IOException if there is an issue
   */
  public static byte[] toBytes(final ARCRecord record) throws IOException {
    ARCRecordMetaData meta = record.getMetaData();
    String versionEtc = versionBlock(meta);

    // The length is appended without narrowing it to int, so that large
    // values are not truncated in the header line.
    String metaline = meta.getUrl() + " " + meta.getIp()
        + " " + meta.getDate() + " " + meta.getMimetype()
        + " " + meta.getLength() + versionEtc;

    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    DataOutputStream dout = new DataOutputStream(baos);
    // Use an explicit charset so the output does not depend on the platform
    // default. NOTE(review): UTF-8 is assumed to match how the header line is
    // decoded on read -- confirm.
    dout.write(metaline.getBytes(StandardCharsets.UTF_8));
    dout.write("\n".getBytes(StandardCharsets.UTF_8));

    // The version block was written above as part of the header, so it is
    // subtracted from the number of bytes still to be copied from the record.
    long recordLength = meta.getLength() - versionEtc.length();
    long len = IOUtils.copyLarge(new BoundedInputStream(record, recordLength),
        dout);
    if (len != recordLength) {
      LOG.error("Read " + len + " bytes but expected " + recordLength
          + " bytes. Continuing...");
    }
    return baos.toByteArray();
  }

  /**
   * Extracts raw contents from an {@code ARCRecord} (including HTTP headers).
   *
   * <p>This is a best-effort read: any exception raised while copying the
   * record is logged and an empty array is returned instead.
   *
   * @param record the {@code ARCRecord}
   * @return raw contents, or an empty array if the record could not be read
   * @throws IOException declared for interface compatibility; failures while
   *     copying the record are caught and logged rather than propagated
   */
  public static byte[] getContent(final ARCRecord record) throws IOException {
    ARCRecordMetaData meta = record.getMetaData();
    String versionEtc = versionBlock(meta);

    try {
      return copyToByteArray(record, (int) meta.getLength()
          - versionEtc.length(), true);
    } catch (Exception e) {
      // Deliberately broad: corrupt archive files can fail in many ways and
      // callers expect an empty result rather than a crash. Log the failure so
      // it is not lost entirely.
      LOG.warn("Unable to read content of record " + meta.getUrl()
          + "; returning empty content.", e);
      return new byte[0];
    }
  }

  /**
   * Extracts contents of the body from an {@code ARCRecord}.
   * Excludes HTTP headers.
   *
   * @param record the {@code ARCRecord}
   * @return contents of the body, or an empty array when the record holds
   *     fewer bytes than its body offset
   * @throws IOException if there is an issue
   */
  public static byte[] getBodyContent(final ARCRecord record)
      throws IOException {
    byte[] raw = getContent(record);
    int bodyOffset = record.getBodyOffset();

    int bodyLength = raw.length - bodyOffset;
    if (bodyLength < 0) {
      // To find out what URL is causing this: record.getMetaData().getUrl()
      // For some records, we're missing the actual content data, likely due
      // to a crawler glitch. Nothing much we can do, so return no content.
      return new byte[0];
    }

    // NOTE(review): a negative body offset is not handled specially and makes
    // the copy below throw, as it always has -- confirm whether that can occur.
    byte[] content = new byte[bodyLength];
    System.arraycopy(raw, bodyOffset, content, 0, bodyLength);
    return content;
  }

  /**
   * Builds the version block that follows the header line of a record located
   * at offset 0: a newline, the version with dots replaced by spaces, the
   * origin, another newline and the field-name line.
   *
   * @param meta metadata of the record
   * @return the version block, or an empty string when the record's offset is
   *     not 0
   */
  private static String versionBlock(final ARCRecordMetaData meta) {
    if (meta.getOffset() != 0) {
      return "";
    }
    return "\n" + meta.getVersion().replace(".", " ")
        + " " + meta.getOrigin() + "\n"
        + FIELD_NAMES_LINE;
  }

  /**
   * Copies contents to a byte array.
   *
   * @param is raw input stream
   * @param recordLength is length of a record
   * @param enforceLength whether to log an error when the number of bytes
   *     read differs from {@code recordLength}
   * @return rawContents of body
   * @throws IOException if there is an issue
   */
  private static byte[] copyToByteArray(final InputStream is,
      final int recordLength, final boolean enforceLength)
      throws IOException {

    BoundedInputStream bis = new BoundedInputStream(is, recordLength);
    byte[] rawContents = IOUtils.toByteArray(bis);
    // A length mismatch is only reported, never treated as fatal.
    if (enforceLength && rawContents.length != recordLength) {
      LOG.error("Read " + rawContents.length + " bytes but expected "
          + recordLength + " bytes. Continuing...");
    }
    return rawContents;
  }
}