001/* 002 * Archives Unleashed Toolkit (AUT): 003 * An open-source platform for analyzing web archives. 004 * 005 * Licensed under the Apache License, Version 2.0 (the "License"); 006 * you may not use this file except in compliance with the License. 007 * You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package io.archivesunleashed.io; 019 020import io.archivesunleashed.data.ArcRecordUtils; 021import io.archivesunleashed.data.WarcRecordUtils; 022import java.io.DataInput; 023import java.io.DataOutput; 024import java.io.IOException; 025import org.apache.hadoop.io.Writable; 026import org.archive.io.ArchiveRecord; 027import org.archive.io.arc.ARCRecord; 028import org.archive.io.warc.WARCRecord; 029 030/** 031 * Implements Hadoop Writable for Archive Records. 032 */ 033public class ArchiveRecordWritable implements Writable { 034 035 /** 036 * Archive Formats that can be used. 037 * {@link #UNKNOWN} 038 * {@link #ARC} 039 * {@link #WARC} 040 */ 041 public enum ArchiveFormat { 042 /** 043 * UNKNOWN format. 044 */ 045 UNKNOWN, 046 047 /** 048 * ARC format. 049 */ 050 ARC, 051 052 /** 053 * WARC format. 054 */ 055 WARC 056 } 057 058 /** 059 * Set default Record format to UNKNOWN. 060 */ 061 private ArchiveFormat format = ArchiveFormat.UNKNOWN; 062 063 /** 064 * Initialize Archive Record to null. 065 */ 066 private ArchiveRecord record = null; 067 068 /** 069 * Utility function. 070 */ 071 public ArchiveRecordWritable() { 072 } 073 074 /** 075 * Initialize Archive Record. 076 * 077 * @param r Archive Record 078 */ 079 public ArchiveRecordWritable(final ArchiveRecord r) { 080 this.record = r; 081 detectFormat(); 082 } 083 084 /** 085 * Set Archive Record. 086 * 087 * @param r Archive Record 088 */ 089 public final void setRecord(final ArchiveRecord r) { 090 this.record = r; 091 detectFormat(); 092 } 093 094 /** 095 * Get Archive Record. 096 * 097 * @return record Archive Record 098 */ 099 public final ArchiveRecord getRecord() { 100 return record; 101 } 102 103 /** 104 * Detect format of Archive Record. 105 */ 106 public final void detectFormat() { 107 if (record instanceof ARCRecord) { 108 format = ArchiveFormat.ARC; 109 } else if (record instanceof WARCRecord) { 110 format = ArchiveFormat.WARC; 111 } else { 112 format = ArchiveFormat.UNKNOWN; 113 } 114 } 115 116 /** 117 * Get format of Archive Record. 118 * 119 * @return format of Archive Record 120 */ 121 public final ArchiveFormat getFormat() { 122 return format; 123 } 124 125 /** 126 * Set format of Archive Record. 127 * 128 * @param f format of Archive Record 129 */ 130 public final void setFormat(final ArchiveFormat f) { 131 this.format = f; 132 } 133 134 @Override 135 public final void readFields(final DataInput in) throws IOException { 136 int len = in.readInt(); 137 if (len == 0) { 138 this.record = null; 139 return; 140 } 141 142 byte[] bytes = new byte[len]; 143 in.readFully(bytes); 144 145 if (getFormat() == ArchiveFormat.ARC) { 146 this.record = ArcRecordUtils.fromBytes(bytes); 147 } else if (getFormat() == ArchiveFormat.WARC) { 148 this.record = WarcRecordUtils.fromBytes(bytes); 149 } else { 150 this.record = null; 151 } 152 } 153 154 @Override 155 public final void write(final DataOutput out) throws IOException { 156 if (record == null) { 157 out.writeInt(0); 158 } 159 byte[] bytes; 160 161 if (getFormat() == ArchiveFormat.ARC) { 162 bytes = ArcRecordUtils.toBytes((ARCRecord) record); 163 } else if (getFormat() == ArchiveFormat.WARC) { 164 bytes = WarcRecordUtils.toBytes((WARCRecord) record); 165 } else { 166 bytes = null; 167 } 168 169 out.writeInt(bytes.length); 170 out.write(bytes); 171 } 172}