Пример #1
0
 /**
  * Populates this corpus with documents built from the files found in a directory.
  *
  * <p>All of the work is handed to the static {@code CorpusImpl.populate} helper, with this corpus
  * passed as the target to fill.
  *
  * @param directory the directory the files are picked from, expressed as a URL for uniformity.
  *     Per the documented contract it has to be a URL of type file, otherwise an
  *     InvalidArgumentException will be thrown.
  * @param filter the file filter that selects files from the target directory; when it is
  *     {@code null} every file is accepted.
  * @param encoding the encoding to be used for reading the documents
  * @param mimeType forwarded unchanged to the delegate; presumably the MIME type to assume for the
  *     created documents (TODO confirm against {@code CorpusImpl})
  * @param recurseDirectories whether the directory is walked recursively. When {@code true}, files
  *     from the given directory and from all of its descendant directories (on as many levels as
  *     necessary) are picked if the filter accepts them; otherwise child directories are ignored.
  * @throws IOException propagated from {@code CorpusImpl.populate}
  * @throws ResourceInstantiationException propagated from {@code CorpusImpl.populate}
  */
 @Override
 public void populate(
     URL directory, FileFilter filter, String encoding, String mimeType, boolean recurseDirectories)
     throws IOException, ResourceInstantiationException {
   // Pure delegation: this corpus is the one being filled.
   CorpusImpl.populate(
       this,
       directory,
       filter,
       encoding,
       mimeType,
       recurseDirectories);
 }
Пример #2
0
 /**
  * Populates this corpus with documents extracted from a single concatenated file.
  *
  * <p>All of the work is handed to the static {@code CorpusImpl.populate} helper, with this corpus
  * passed as the target to fill; its result is returned as is.
  *
  * @param singleConcatenatedFile the single concatenated file.
  * @param documentRootElement the element whose start and end delimit the content that is
  *     considered for each document.
  * @param encoding the encoding of the concatenated (trec) file.
  * @param numberOfFilesToExtract the number of files to extract from the concatenated (trecweb)
  *     file.
  * @param documentNamePrefix the prefix to use for the names of the documents that are created
  * @param mimeType the mime type which determines how the document is handled
  * @param includeRootElement forwarded unchanged to the delegate; presumably controls whether the
  *     root element itself is kept in each extracted document (TODO confirm against
  *     {@code CorpusImpl})
  * @return total length of populated documents in the corpus in number of bytes
  * @throws IOException propagated from {@code CorpusImpl.populate}
  * @throws ResourceInstantiationException propagated from {@code CorpusImpl.populate}
  */
 @Override
 public long populate(
     URL singleConcatenatedFile,
     String documentRootElement,
     String encoding,
     int numberOfFilesToExtract,
     String documentNamePrefix,
     String mimeType,
     boolean includeRootElement)
     throws IOException, ResourceInstantiationException {
   // Pure delegation: this corpus is the one being filled.
   long populatedLength =
       CorpusImpl.populate(
           this,
           singleConcatenatedFile,
           documentRootElement,
           encoding,
           numberOfFilesToExtract,
           documentNamePrefix,
           mimeType,
           includeRootElement);
   return populatedLength;
 }