/*
 * Archives Unleashed Toolkit (AUT):
 * An open-source platform for analyzing web archives.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.archivesunleashed.data;

import org.apache.hadoop.io.Writable;
import org.archive.io.ArchiveRecord;
import org.archive.io.arc.ARCRecord;
import org.archive.io.warc.WARCRecord;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Implements Hadoop Writable for Archive Records.
 *
 * <p>The serialized form is a 4-byte length followed by that many bytes of
 * record data. A length of zero stands for "no record". The archive format
 * itself is not part of the serialized form.
 */
public class ArchiveRecordWritable implements Writable {

  /**
   * Archive Formats that can be used.
   * {@link #UNKNOWN}
   * {@link #ARC}
   * {@link #WARC}
   */
  public enum ArchiveFormat {
    /**
     * UNKNOWN format.
     */
    UNKNOWN,

    /**
     * ARC format.
     */
    ARC,

    /**
     * WARC format.
     */
    WARC
  }

  /**
   * Format of the wrapped record; UNKNOWN until detected or set.
   */
  private ArchiveFormat format = ArchiveFormat.UNKNOWN;

  /**
   * The wrapped Archive Record; null when no record is held.
   */
  private ArchiveRecord record = null;

  /**
   * Creates an empty instance: the record is null and the format is UNKNOWN.
   */
  public ArchiveRecordWritable() {
  }

  /**
   * Creates an instance wrapping the given Archive Record and detects its
   * format.
   *
   * @param r Archive Record
   */
  public ArchiveRecordWritable(final ArchiveRecord r) {
    this.record = r;
    detectFormat();
  }

  /**
   * Set Archive Record and re-detect its format.
   *
   * @param r Archive Record
   */
  public final void setRecord(final ArchiveRecord r) {
    this.record = r;
    detectFormat();
  }

  /**
   * Get Archive Record.
   *
   * @return record Archive Record, or null if none is held
   */
  public final ArchiveRecord getRecord() {
    return record;
  }

  /**
   * Detect format of Archive Record from its runtime type. Anything that is
   * neither an ARCRecord nor a WARCRecord (including null) is UNKNOWN.
   */
  public final void detectFormat() {
    if (record instanceof ARCRecord) {
      format = ArchiveFormat.ARC;
    } else if (record instanceof WARCRecord) {
      format = ArchiveFormat.WARC;
    } else {
      format = ArchiveFormat.UNKNOWN;
    }
  }

  /**
   * Get format of Archive Record.
   *
   * @return format of Archive Record
   */
  public final ArchiveFormat getFormat() {
    return format;
  }

  /**
   * Set format of Archive Record.
   *
   * @param f format of Archive Record
   */
  public final void setFormat(final ArchiveFormat f) {
    this.format = f;
  }

  /**
   * Reads a record from the given input.
   *
   * <p>The format is not stored in the serialized form, so the current value
   * of {@link #getFormat()} decides how the bytes are decoded. With an
   * UNKNOWN format the bytes are consumed and the record is set to null.
   *
   * @param in input to read the length-prefixed record bytes from
   * @throws IOException if reading fails or the length prefix is negative
   */
  @Override
  public final void readFields(final DataInput in) throws IOException {
    final int len = in.readInt();
    if (len < 0) {
      // A negative prefix can only come from a corrupt or foreign stream;
      // fail clearly instead of with NegativeArraySizeException.
      throw new IOException("Invalid archive record length: " + len);
    }
    if (len == 0) {
      this.record = null;
      return;
    }

    final byte[] bytes = new byte[len];
    in.readFully(bytes);

    if (getFormat() == ArchiveFormat.ARC) {
      this.record = ArcRecordUtils.fromBytes(bytes);
    } else if (getFormat() == ArchiveFormat.WARC) {
      this.record = WarcRecordUtils.fromBytes(bytes);
    } else {
      this.record = null;
    }
  }

  /**
   * Writes the record to the given output as a length prefix followed by the
   * record bytes.
   *
   * <p>A null record, or a record whose format is UNKNOWN, is written as a
   * single zero length, which {@link #readFields(DataInput)} reads back as a
   * null record.
   *
   * @param out output to write to
   * @throws IOException if writing or record serialization fails
   */
  @Override
  public final void write(final DataOutput out) throws IOException {
    if (record == null) {
      out.writeInt(0);
      // Nothing more to write; continuing would dereference the null record.
      return;
    }

    final byte[] bytes;
    if (getFormat() == ArchiveFormat.ARC) {
      bytes = ArcRecordUtils.toBytes((ARCRecord) record);
    } else if (getFormat() == ArchiveFormat.WARC) {
      bytes = WarcRecordUtils.toBytes((WARCRecord) record);
    } else {
      bytes = null;
    }

    if (bytes == null) {
      // No serializer applies (UNKNOWN format): emit the "no record" marker
      // rather than failing on bytes.length.
      out.writeInt(0);
      return;
    }

    out.writeInt(bytes.length);
    out.write(bytes);
  }
}