implicit class WARecordRDD extends Serializable
A wrapper class around RDD that allows RDDs of type ArchiveRecord to be queried via a fluent API.
To load such an RDD, please see RecordLoader.
- Alphabetic
- By Inheritance
- WARecordRDD
- Serializable
- AnyRef
- Any
- Hide All
- Show All
- Public
- All
Instance Constructors
- new WARecordRDD(rdd: RDD[ArchiveRecord])
Value Members
-
final
def
!=(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
final
def
##(): Int
- Definition Classes
- AnyRef → Any
-
final
def
==(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
- def all(): DataFrame
-
final
def
asInstanceOf[T0]: T0
- Definition Classes
- Any
- def audio(): DataFrame
-
def
clone(): AnyRef
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws( ... ) @native() @HotSpotIntrinsicCandidate()
- def css(): DataFrame
-
def
discardContent(contentREs: Set[Regex]): RDD[ArchiveRecord]
Filters detected content (regex).
Filters detected content (regex).
- contentREs
a set of regular expressions
-
def
discardDate(dates: List[String], component: DateComponent = DateComponent.YYYYMMDD): RDD[ArchiveRecord]
Filters detected dates.
-
def
discardDomains(urls: Set[String]): RDD[ArchiveRecord]
Filters detected domains (regex).
Filters detected domains (regex).
- urls
a set of URLs for the source domains
-
def
discardHttpStatus(statusCodes: Set[String]): RDD[ArchiveRecord]
Filters detected HTTP status codes.
Filters detected HTTP status codes.
- statusCodes
a set of HTTP status codes
-
def
discardLanguages(lang: Set[String]): RDD[ArchiveRecord]
Filters detected language.
Filters detected language.
- lang
a set of ISO 639-2 codes
-
def
discardMimeTypes(mimeTypes: Set[String]): RDD[ArchiveRecord]
Filters ArchiveRecord MimeTypes (web server).
Filters ArchiveRecord MimeTypes (web server).
- mimeTypes
a set of Mime Types
-
def
discardMimeTypesTika(mimeTypes: Set[String]): RDD[ArchiveRecord]
Filters detected MimeTypes (Tika).
Filters detected MimeTypes (Tika).
- mimeTypes
a set of Mime Types
-
def
discardUrlPatterns(urlREs: Set[Regex]): RDD[ArchiveRecord]
Filters detected URL patterns (regex).
Filters detected URL patterns (regex).
- urlREs
a set of regular expressions
-
def
discardUrls(urls: Set[String]): RDD[ArchiveRecord]
Filters detected URLs.
Filters detected URLs.
- urls
a set of URLs
-
final
def
eq(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
-
def
equals(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
final
def
getClass(): Class[_]
- Definition Classes
- AnyRef → Any
- Annotations
- @native() @HotSpotIntrinsicCandidate()
-
def
hashCode(): Int
- Definition Classes
- AnyRef → Any
- Annotations
- @native() @HotSpotIntrinsicCandidate()
- def html(): DataFrame
- def imagegraph(): DataFrame
- def images(): DataFrame
-
final
def
isInstanceOf[T0]: Boolean
- Definition Classes
- Any
- def js(): DataFrame
- def json(): DataFrame
-
def
keepContent(contentREs: Set[Regex]): RDD[ArchiveRecord]
Removes all content that does not pass Regular Expression test.
Removes all content that does not pass Regular Expression test.
- contentREs
a set of regular expressions to keep
-
def
keepDate(dates: List[String], component: DateComponent = DateComponent.YYYYMMDD): RDD[ArchiveRecord]
Removes all data that does not have selected date.
Removes all data that does not have selected date.
- dates
a list of dates
- component
the selected DateComponent enum value
-
def
keepDomains(urls: Set[String]): RDD[ArchiveRecord]
Removes all data but selected source domains.
Removes all data but selected source domains.
- urls
a set of URLs for the source domains
-
def
keepHttpStatus(statusCodes: Set[String]): RDD[ArchiveRecord]
Removes all data that does not have selected HTTP status codes.
Removes all data that does not have selected HTTP status codes.
- statusCodes
a set of HTTP status codes
-
def
keepImages(): RDD[ArchiveRecord]
Removes all data except images.
-
def
keepLanguages(lang: Set[String]): RDD[ArchiveRecord]
Removes all data not in selected language.
Removes all data not in selected language.
- lang
a set of ISO 639-2 codes
-
def
keepMimeTypes(mimeTypes: Set[String]): RDD[ArchiveRecord]
Removes all data but selected mimeTypes specified.
Removes all data but selected mimeTypes specified.
- mimeTypes
a set of Mime Types
-
def
keepMimeTypesTika(mimeTypes: Set[String]): RDD[ArchiveRecord]
Removes all data but selected mimeTypes as detected by Tika.
Removes all data but selected mimeTypes as detected by Tika.
- mimeTypes
a set of Mime Types
-
def
keepUrlPatterns(urlREs: Set[Regex]): RDD[ArchiveRecord]
Removes all data but selected URL patterns.
Removes all data but selected URL patterns.
- urlREs
a set of regular expressions
-
def
keepUrls(urls: Set[String]): RDD[ArchiveRecord]
Removes all data but selected exact URLs.
Removes all data but selected exact URLs.
- urls
a set of URLs to keep
-
def
keepValidPages(): RDD[ArchiveRecord]
Removes all non-html-based data (images, executables, etc.) from html text.
-
final
def
ne(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
-
final
def
notify(): Unit
- Definition Classes
- AnyRef
- Annotations
- @native() @HotSpotIntrinsicCandidate()
-
final
def
notifyAll(): Unit
- Definition Classes
- AnyRef
- Annotations
- @native() @HotSpotIntrinsicCandidate()
- def pdfs(): DataFrame
- def plainText(): DataFrame
- def presentationProgramFiles(): DataFrame
-
def
removeFiledesc(): RDD[ArchiveRecord]
Filters out filedesc:// and dns: records.
- def spreadsheets(): DataFrame
-
final
def
synchronized[T0](arg0: ⇒ T0): T0
- Definition Classes
- AnyRef
-
def
toString(): String
- Definition Classes
- AnyRef → Any
- def videos(): DataFrame
-
final
def
wait(arg0: Long, arg1: Int): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... )
-
final
def
wait(arg0: Long): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... ) @native()
-
final
def
wait(): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... )
-
def
webgraph(): DataFrame
Extracts a webgraph with columns for crawl date, source url, destination url, and anchor text.
-
def
webpages(): DataFrame
Extracts webpages with columns for crawl date, url, MIME type, and content.
- def wordProcessorFiles(): DataFrame
- def xml(): DataFrame
Deprecated Value Members
-
def
finalize(): Unit
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws( classOf[java.lang.Throwable] ) @Deprecated @deprecated
- Deprecated
(Since version ) see corresponding Javadoc for more information.