implicit class WARecordRDD extends Serializable
A wrapper class around RDD that allows RDDs of type ArchiveRecord to be queried via a fluent API.
To load such an RDD, please see RecordLoader.
- Alphabetic
- By Inheritance
- WARecordRDD
- Serializable
- AnyRef
- Any
- Hide All
- Show All
- Public
- All
Instance Constructors
-  new WARecordRDD(rdd: RDD[ArchiveRecord])
Value Members
- 
      
      
      
        
      
    
      
        final 
        def
      
      
        !=(arg0: Any): Boolean
      
      
      - Definition Classes
- AnyRef → Any
 
- 
      
      
      
        
      
    
      
        final 
        def
      
      
        ##(): Int
      
      
      - Definition Classes
- AnyRef → Any
 
- 
      
      
      
        
      
    
      
        final 
        def
      
      
        ==(arg0: Any): Boolean
      
      
      - Definition Classes
- AnyRef → Any
 
-  def all(): DataFrame
- 
      
      
      
        
      
    
      
        final 
        def
      
      
        asInstanceOf[T0]: T0
      
      
      - Definition Classes
- Any
 
-  def audio(): DataFrame
- 
      
      
      
        
      
    
      
        
        def
      
      
        clone(): AnyRef
      
      
      - Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws( ... ) @native() @HotSpotIntrinsicCandidate()
 
-  def css(): DataFrame
- 
      
      
      
        
      
    
      
        
        def
      
      
        discardContent(contentREs: Set[Regex]): RDD[ArchiveRecord]
      
      
      Filters detected content (regex).
- contentREs
- a set of regular expressions
 
- 
      
      
      
        
      
    
      
        
        def
      
      
        discardDate(dates: List[String], component: DateComponent = DateComponent.YYYYMMDD): RDD[ArchiveRecord]
      
      
      Filters detected dates. 
- 
      
      
      
        
      
    
      
        
        def
      
      
        discardDomains(urls: Set[String]): RDD[ArchiveRecord]
      
      
      Filters detected domains (regex).
- urls
- a set of URLs for the source domains
 
- 
      
      
      
        
      
    
      
        
        def
      
      
        discardHttpStatus(statusCodes: Set[String]): RDD[ArchiveRecord]
      
      
      Filters detected HTTP status codes.
- statusCodes
- a set of HTTP status codes
 
- 
      
      
      
        
      
    
      
        
        def
      
      
        discardLanguages(lang: Set[String]): RDD[ArchiveRecord]
      
      
      Filters detected language.
- lang
- a set of ISO 639-2 codes
 
- 
      
      
      
        
      
    
      
        
        def
      
      
        discardMimeTypes(mimeTypes: Set[String]): RDD[ArchiveRecord]
      
      
      Filters ArchiveRecord MIME types (web server).
- mimeTypes
- a set of MIME types
 
- 
      
      
      
        
      
    
      
        
        def
      
      
        discardMimeTypesTika(mimeTypes: Set[String]): RDD[ArchiveRecord]
      
      
      Filters detected MIME types (Tika).
- mimeTypes
- a set of MIME types
 
- 
      
      
      
        
      
    
      
        
        def
      
      
        discardUrlPatterns(urlREs: Set[Regex]): RDD[ArchiveRecord]
      
      
      Filters detected URL patterns (regex).
- urlREs
- a set of regular expressions
 
- 
      
      
      
        
      
    
      
        
        def
      
      
        discardUrls(urls: Set[String]): RDD[ArchiveRecord]
      
      
      Filters detected URLs.
- urls
- a set of URLs
 
- 
      
      
      
        
      
    
      
        final 
        def
      
      
        eq(arg0: AnyRef): Boolean
      
      
      - Definition Classes
- AnyRef
 
- 
      
      
      
        
      
    
      
        
        def
      
      
        equals(arg0: Any): Boolean
      
      
      - Definition Classes
- AnyRef → Any
 
- 
      
      
      
        
      
    
      
        final 
        def
      
      
        getClass(): Class[_]
      
      
      - Definition Classes
- AnyRef → Any
- Annotations
- @native() @HotSpotIntrinsicCandidate()
 
- 
      
      
      
        
      
    
      
        
        def
      
      
        hashCode(): Int
      
      
      - Definition Classes
- AnyRef → Any
- Annotations
- @native() @HotSpotIntrinsicCandidate()
 
-  def html(): DataFrame
-  def imagegraph(): DataFrame
-  def images(): DataFrame
- 
      
      
      
        
      
    
      
        final 
        def
      
      
        isInstanceOf[T0]: Boolean
      
      
      - Definition Classes
- Any
 
-  def js(): DataFrame
-  def json(): DataFrame
- 
      
      
      
        
      
    
      
        
        def
      
      
        keepContent(contentREs: Set[Regex]): RDD[ArchiveRecord]
      
      
      Removes all content that does not match any of the given regular expressions.
- contentREs
- a set of regular expressions to keep
 
- 
      
      
      
        
      
    
      
        
        def
      
      
        keepDate(dates: List[String], component: DateComponent = DateComponent.YYYYMMDD): RDD[ArchiveRecord]
      
      
      Removes all data that does not have one of the selected dates.
- dates
- a list of dates
- component
- the selected DateComponent enum value
 
- 
      
      
      
        
      
    
      
        
        def
      
      
        keepDomains(urls: Set[String]): RDD[ArchiveRecord]
      
      
      Removes all data except that from the selected source domains.
- urls
- a set of URLs for the source domains
 
- 
      
      
      
        
      
    
      
        
        def
      
      
        keepHttpStatus(statusCodes: Set[String]): RDD[ArchiveRecord]
      
      
      Removes all data that does not have one of the selected HTTP status codes.
- statusCodes
- a set of HTTP status codes
 
- 
      
      
      
        
      
    
      
        
        def
      
      
        keepImages(): RDD[ArchiveRecord]
      
      
      Removes all data except images. 
- 
      
      
      
        
      
    
      
        
        def
      
      
        keepLanguages(lang: Set[String]): RDD[ArchiveRecord]
      
      
      Removes all data not in the selected languages.
- lang
- a set of ISO 639-2 codes
 
- 
      
      
      
        
      
    
      
        
        def
      
      
        keepMimeTypes(mimeTypes: Set[String]): RDD[ArchiveRecord]
      
      
      Removes all data except the specified MIME types.
- mimeTypes
- a set of MIME types
 
- 
      
      
      
        
      
    
      
        
        def
      
      
        keepMimeTypesTika(mimeTypes: Set[String]): RDD[ArchiveRecord]
      
      
      Removes all data except the selected MIME types as detected by Tika.
- mimeTypes
- a set of MIME types
 
- 
      
      
      
        
      
    
      
        
        def
      
      
        keepUrlPatterns(urlREs: Set[Regex]): RDD[ArchiveRecord]
      
      
      Removes all data except the selected URL patterns.
- urlREs
- a set of regular expressions
 
- 
      
      
      
        
      
    
      
        
        def
      
      
        keepUrls(urls: Set[String]): RDD[ArchiveRecord]
      
      
      Removes all data except the selected exact URLs.
- urls
- a set of URLs to keep
 
- 
      
      
      
        
      
    
      
        
        def
      
      
        keepValidPages(): RDD[ArchiveRecord]
      
      
      Removes all non-html-based data (images, executables, etc.) from html text. 
- 
      
      
      
        
      
    
      
        final 
        def
      
      
        ne(arg0: AnyRef): Boolean
      
      
      - Definition Classes
- AnyRef
 
- 
      
      
      
        
      
    
      
        final 
        def
      
      
        notify(): Unit
      
      
      - Definition Classes
- AnyRef
- Annotations
- @native() @HotSpotIntrinsicCandidate()
 
- 
      
      
      
        
      
    
      
        final 
        def
      
      
        notifyAll(): Unit
      
      
      - Definition Classes
- AnyRef
- Annotations
- @native() @HotSpotIntrinsicCandidate()
 
-  def pdfs(): DataFrame
-  def plainText(): DataFrame
-  def presentationProgramFiles(): DataFrame
- 
      
      
      
        
      
    
      
        
        def
      
      
        removeFiledesc(): RDD[ArchiveRecord]
      
      
      Filters out filedesc:// and dns: records. 
-  def spreadsheets(): DataFrame
- 
      
      
      
        
      
    
      
        final 
        def
      
      
        synchronized[T0](arg0: ⇒ T0): T0
      
      
      - Definition Classes
- AnyRef
 
- 
      
      
      
        
      
    
      
        
        def
      
      
        toString(): String
      
      
      - Definition Classes
- AnyRef → Any
 
-  def videos(): DataFrame
- 
      
      
      
        
      
    
      
        final 
        def
      
      
        wait(arg0: Long, arg1: Int): Unit
      
      
      - Definition Classes
- AnyRef
- Annotations
- @throws( ... )
 
- 
      
      
      
        
      
    
      
        final 
        def
      
      
        wait(arg0: Long): Unit
      
      
      - Definition Classes
- AnyRef
- Annotations
- @throws( ... ) @native()
 
- 
      
      
      
        
      
    
      
        final 
        def
      
      
        wait(): Unit
      
      
      - Definition Classes
- AnyRef
- Annotations
- @throws( ... )
 
- 
      
      
      
        
      
    
      
        
        def
      
      
        webgraph(): DataFrame
      
      
      Extracts a webgraph with columns for crawl date, source url, destination url, and anchor text. 
- 
      
      
      
        
      
    
      
        
        def
      
      
        webpages(): DataFrame
      
      
      Extracts webpages with columns for crawl date, url, MIME type, and content.
-  def wordProcessorFiles(): DataFrame
-  def xml(): DataFrame
Deprecated Value Members
- 
      
      
      
        
      
    
      
        
        def
      
      
        finalize(): Unit
      
      
      - Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws( classOf[java.lang.Throwable] ) @Deprecated @deprecated
- Deprecated
- (Since version ) see corresponding Javadoc for more information.