/**
 * Populates this corpus with documents built from the files found in a directory.
 *
 * <p>This method is a thin wrapper: all of the work is delegated to the static helper {@link
 * gate.corpora.CorpusImpl#populate(Corpus, URL, FileFilter, String, String, boolean)}, with this
 * corpus passed as the target.
 *
 * @param directory the directory the files are taken from. It is expressed as a URL for
 *     uniformity, but it is documented to require a URL of type file; any other kind of URL is
 *     expected to be rejected with an invalid-argument exception by the implementation.
 * @param filter the file filter that selects which files of the directory are used. When the
 *     filter is {@code null} every file is accepted.
 * @param encoding the character encoding used when reading the documents
 * @param mimeType the mime type, handed unchanged to the static implementation (presumably it
 *     determines how each document is handled; confirm against {@code CorpusImpl})
 * @param recurseDirectories whether sub-directories are visited as well. When {@code true}, the
 *     files of the given directory and of all its descendant directories (to any depth) are
 *     used if the filter accepts them; otherwise sub-directories are ignored.
 * @throws IOException if the static implementation reports an I/O failure
 * @throws ResourceInstantiationException if the static implementation reports that a document
 *     resource could not be created
 */
@Override
public void populate(
    URL directory,
    FileFilter filter,
    String encoding,
    String mimeType,
    boolean recurseDirectories)
    throws IOException, ResourceInstantiationException {
  // Nothing is done locally; the shared static implementation fills this corpus.
  CorpusImpl.populate(
      this,
      directory,
      filter,
      encoding,
      mimeType,
      recurseDirectories);
}
/**
 * Populates this corpus with documents extracted from one single concatenated file.
 *
 * <p>This method is a thin wrapper: all of the work is delegated to the static {@code
 * CorpusImpl.populate} helper, with this corpus passed as the target, and that helper's result
 * is returned unchanged.
 *
 * @param singleConcatenatedFile the single concatenated file the documents are read from
 * @param documentRootElement the element that delimits a document; the content between the
 *     start and the end of this element is considered for documents
 * @param encoding the character encoding of the concatenated file
 * @param numberOfFilesToExtract the number of documents to extract from the concatenated file
 * @param documentNamePrefix the prefix used for the names of the documents that are created
 * @param mimeType the mime type which determines how the document is handled
 * @param includeRootElement handed unchanged to the static implementation (presumably controls
 *     whether the root element itself is kept as part of each extracted document; confirm
 *     against {@code CorpusImpl})
 * @return the total length, in bytes, of the documents populated into the corpus
 * @throws IOException if the static implementation reports an I/O failure
 * @throws ResourceInstantiationException if the static implementation reports that a document
 *     resource could not be created
 */
@Override
public long populate(
    URL singleConcatenatedFile,
    String documentRootElement,
    String encoding,
    int numberOfFilesToExtract,
    String documentNamePrefix,
    String mimeType,
    boolean includeRootElement)
    throws IOException, ResourceInstantiationException {
  // Nothing is done locally; the shared static implementation fills this corpus.
  final long populatedLength =
      CorpusImpl.populate(
          this,
          singleConcatenatedFile,
          documentRootElement,
          encoding,
          numberOfFilesToExtract,
          documentNamePrefix,
          mimeType,
          includeRootElement);
  return populatedLength;
}