/*
 * Copyright © 2017 The Archives Unleashed Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.archivesunleashed.data;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.BoundedInputStream;
import org.apache.log4j.Logger;
import org.archive.io.arc.ARCReader;
import org.archive.io.arc.ARCReaderFactory;
import org.archive.io.arc.ARCRecord;
import org.archive.io.arc.ARCRecordMetaData;

/** Utilities for working with {@code ARCRecord}s (from archive.org APIs). */
public final class ArcRecordUtils {

  /** Utility classes should not have a public or default constructor. */
  private ArcRecordUtils() {}

  /** Setup logger. */
  private static final Logger LOG = Logger.getLogger(ArcRecordUtils.class);

  /**
   * Converts raw bytes into an {@code ARCRecord}.
   *
   * @param bytes raw bytes
   * @return parsed {@code ARCRecord}
   * @throws IOException if there is an issue
   */
  public static ARCRecord fromBytes(final byte[] bytes) throws IOException {
    ARCReader reader =
        (ARCReader)
            ARCReaderFactory.get(
                "", new BufferedInputStream(new ByteArrayInputStream(bytes)), false);
    return (ARCRecord) reader.get();
  }

  /**
   * Converts an ARC record into raw bytes.
   *
   * @param record contents of an ARC response record
   * @return raw contents
   * @throws IOException if there is an issue
   */
  public static byte[] toBytes(final ARCRecord record) throws IOException {
    ARCRecordMetaData meta = record.getMetaData();

    String metaline =
        meta.getUrl()
            + " "
            + meta.getIp()
            + " "
            + meta.getDate()
            + " "
            + meta.getMimetype()
            + " "
            + (int) meta.getLength();
    String versionEtc = "";

    if (meta.getOffset() == 0) {
      versionEtc =
          "\n"
              + meta.getVersion().replace(".", " ")
              + " "
              + meta.getOrigin()
              + "\n"
              + "URL IP-address Archive-date Content-type Archive-length";
      metaline += versionEtc;
    }

    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    DataOutputStream dout = new DataOutputStream(baos);
    dout.write(metaline.getBytes());
    dout.write("\n".getBytes());

    long recordLength = meta.getLength() - versionEtc.length();
    long len = IOUtils.copyLarge(new BoundedInputStream(record, recordLength), dout);
    if (len != recordLength) {
      LOG.error("Read " + len + " bytes but expected " + recordLength + " bytes. Continuing...");
    }
    return baos.toByteArray();
  }

  /**
   * Extracts raw contents from an {@code ARCRecord} (including HTTP headers).
   *
   * @param record the {@code ARCRecord}
   * @return raw contents
   * @throws IOException if there is an issue
   */
  public static byte[] getContent(final ARCRecord record) throws IOException {
    ARCRecordMetaData meta = record.getMetaData();
    String versionEtc = "";

    if (meta.getOffset() == 0) {
      versionEtc =
          "\n"
              + meta.getVersion().replace(".", " ")
              + " "
              + meta.getOrigin()
              + "\n"
              + "URL IP-address Archive-date Content-type Archive-length";
    }

    try {
      return copyToByteArray(record, (int) meta.getLength() - versionEtc.length(), true);
    } catch (Exception e) {
      // Catch exceptions related to any corrupt archive files.
      return new byte[0];
    }
  }

  /**
   * Extracts contents of the body from an {@code ARCRecord}. Excludes HTTP headers.
   *
   * @param record the {@code ARCRecord}
   * @return contents of the body
   * @throws IOException if there is an issue
   */
  public static byte[] getBodyContent(final ARCRecord record) throws IOException {
    byte[] raw = getContent(record);
    int bodyOffset = record.getBodyOffset();

    byte[] content = null;
    try {
      content = new byte[raw.length - bodyOffset];
      System.arraycopy(raw, bodyOffset, content, 0, content.length);
    } catch (java.lang.NegativeArraySizeException e) {
      // To find out which URL is causing the error: record.getMetaData().getUrl()
      // For some records, we're missing the actual content data, likely due
      // to a crawler glitch. Nothing much we can do, just swallow and move on.
      content = new byte[0];
    }
    return content;
  }

  /**
   * Copies contents to a byte array.
   *
   * @param is raw input stream
   * @param recordLength length of the record
   * @param enforceLength enforce the length
   * @return raw contents of the body
   * @throws IOException if there is an issue
   */
  private static byte[] copyToByteArray(
      final InputStream is, final int recordLength, final boolean enforceLength)
      throws IOException {

    BoundedInputStream bis = new BoundedInputStream(is, recordLength);
    byte[] rawContents = IOUtils.toByteArray(bis);
    if (enforceLength && rawContents.length != recordLength) {
      LOG.error(
          "Read "
              + rawContents.length
              + " bytes but expected "
              + recordLength
              + " bytes. Continuing...");
    }
    return rawContents;
  }
}
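
/*
 * Minimal usage sketch (not part of the original class): iterate over the records
 * of an ARC file and pull out the raw and body-only contents of each one. The path
 * "example.arc.gz" is a placeholder, and this assumes ARCReaderFactory also offers
 * a single-argument get(String) overload for reading from a file path; error
 * handling and resource cleanup are omitted for brevity.
 *
 *   ARCReader reader = ARCReaderFactory.get("example.arc.gz");
 *   for (org.archive.io.ArchiveRecord rec : reader) {
 *     ARCRecord arc = (ARCRecord) rec;
 *     byte[] headersAndBody = ArcRecordUtils.getContent(arc);
 *     byte[] bodyOnly = ArcRecordUtils.getBodyContent(arc);
 *   }
 *   reader.close();
 */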