@Override
protected void processSubDataEntity(
        MultiValueHashMap<String, Object> subDataEntityInformation,
        Metadata metadata,
        ContentHandler handler2use4recursiveCall,
        ParseContext context)
        throws Exception {

    URLName urlNameWithPassword =
            (URLName) subDataEntityInformation.getFirst("urlNameWithPassword");

    String strMessageId = (String) subDataEntityInformation.getFirst("Message-ID");
    String strMessageFolder = (String) subDataEntityInformation.getFirst("folder");

    String strEntityId = ImapURLStreamProvider.getEntityId(strMessageFolder, strMessageId);

    // We set these entries right away - the data was already loaded efficiently in a prefetching
    // step. If they are already present in the Metadata object, addFirstMetadata will not load
    // them again.
    metadata.set(Metadata.SOURCE, urlNameWithPassword.toString());
    metadata.set(IncrementalCrawlingHistory.dataEntityId, strEntityId);
    metadata.set(
            IncrementalCrawlingHistory.dataEntityContentFingerprint,
            ImapURLStreamProvider.getDataEntityContentFingerprint(strEntityId));

    URLName urlNameWithoutPassword =
            new URLName(
                    urlNameWithPassword.getProtocol(),
                    urlNameWithPassword.getHost(),
                    urlNameWithPassword.getPort(),
                    urlNameWithPassword.getFile(),
                    urlNameWithPassword.getUsername(),
                    "");
    metadata.set(Metadata.RESOURCE_NAME_KEY, urlNameWithoutPassword.toString());

    if (strMessageId == null)
        metadata.set("Content-Type", DatasourceMediaTypes.IMAPFOLDER.toString());
    else
        metadata.set("Content-Type", "message/rfc822");

    metadata =
            URLStreamProvider.getURLStreamProvider4Protocol(urlNameWithPassword.getProtocol())
                    .addFirstMetadata(urlNameWithPassword, metadata, context);

    InputStream stream =
            URLStreamProvider.getURLStreamProvider(urlNameWithPassword)
                    .getStream(urlNameWithPassword, metadata, context);

    try {
        if (m_leech == null) m_leech = new Leech();

        // for a message this should hopefully pick the Tika RFC822Parser
        Parser parser = m_leech.getParser();

        parser.parse(stream, handler2use4recursiveCall, metadata, context);
    } finally {
        if (stream != null) stream.close();
    }
}
@Override
public TikaInputStream getStream(
        URLName url2getStream, Metadata metadata, ParseContext parseContext) throws Exception {

    final URL asUrl = new URL(url2getStream.toString());

    return TikaInputStream.get(
            new ShiftInitInputStream() {
                @Override
                protected InputStream initBeforeFirstStreamDataAccess() throws Exception {
                    URLConnection connection = asUrl.openConnection();
                    connection.setConnectTimeout(connectTimeout);
                    connection.setReadTimeout(readTimeout);
                    connection.setRequestProperty("Accept-Encoding", "gzip");

                    InputStream ourStream = connection.getInputStream();

                    String strContentEncoding = connection.getHeaderField("Content-Encoding");
                    if (strContentEncoding != null)
                        strContentEncoding = strContentEncoding.toLowerCase().trim();

                    // transparently unwrap gzip-compressed responses
                    if ("gzip".equals(strContentEncoding))
                        ourStream = new BufferedInputStream(new GZIPInputStream(ourStream));
                    else
                        ourStream = new BufferedInputStream(ourStream);

                    return ourStream;
                }
            });
}
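For context, this is how a caller might obtain and consume such a stream. A minimal sketch, assuming the URLStreamProvider lookup shown in the first snippet is available on the classpath and that the HTTP URL is reachable; the URL itself is a placeholder:

import java.io.InputStream;
import javax.mail.URLName;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;

public class GetStreamExample {
    public static void main(String[] args) throws Exception {
        URLName url = new URLName("http://www.example.org/index.html");

        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();

        // the provider opens the connection lazily on first read and transparently
        // unwraps gzip-encoded responses, as shown in the snippet above
        InputStream stream =
                URLStreamProvider.getURLStreamProvider(url).getStream(url, metadata, context);
        try {
            byte[] buffer = new byte[8192];
            int total = 0;
            for (int read; (read = stream.read(buffer)) != -1; ) total += read;
            System.out.println("read " + total + " bytes from " + url);
        } finally {
            stream.close();
        }
    }
}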
public void testGetFolder() {
    try {
        Folder f = store.getFolder(url.getFile());
        assertNotNull(f);
    } catch (MessagingException e) {
        fail(e.getMessage());
    }
}
/** We override toString() so we can display the store URLName without the password. */
public String toString() {
    if (display == null) {
        URLName url = store.getURLName();
        if (url == null) {
            display = store.toString();
        } else {
            // don't show the password
            URLName too =
                    new URLName(
                            url.getProtocol(),
                            url.getHost(),
                            url.getPort(),
                            url.getFile(),
                            url.getUsername(),
                            null);
            display = too.toString();
        }
    }
    return display;
}
public static Store connect2Server(URLName url, ParseContext context) throws MessagingException {

    ImapCrawlerContext imapCrawlerContext =
            context.get(ImapCrawlerContext.class, new ImapCrawlerContext());

    Properties properties = System.getProperties();
    properties.setProperty("mail.store.protocol", url.getProtocol());

    if (imapCrawlerContext.getIgnoreSSLCertificates()) {
        properties.setProperty(
                "mail.imaps.socketFactory.class",
                CertificateIgnoringSocketFactory.class.getName());
        properties.setProperty("mail.imaps.socketFactory.fallback", "false");
    }

    if (!StringUtils.nullOrWhitespace(imapCrawlerContext.getSSLCertificateFilePath())
            && "imaps".equalsIgnoreCase(url.getProtocol())) {
        properties.setProperty(
                "javax.net.ssl.trustStore", imapCrawlerContext.getSSLCertificateFilePath());
        properties.setProperty(
                "javax.net.ssl.trustStorePassword",
                imapCrawlerContext.getSSLCertificateFilePassword());
    }

    Session session = Session.getDefaultInstance(properties);

    Store mailStore = session.getStore(url.getProtocol());

    String strUserName = imapCrawlerContext.getUserName();
    if (strUserName == null) strUserName = url.getUsername();

    String strPassword = imapCrawlerContext.getPassword();
    if (strPassword == null) strPassword = url.getPassword();

    if (!mailStore.isConnected())
        mailStore.connect(url.getHost(), url.getPort(), strUserName, strPassword);

    return mailStore;
}
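connect2Server resolves credentials either from the ImapCrawlerContext or, as a fallback, from the URLName itself. A minimal usage sketch, assuming it is called from (or statically imported into) the class that declares it, a plain non-SSL IMAP account, and an empty ParseContext; host, port and credentials are placeholders:

import javax.mail.Folder;
import javax.mail.Store;
import javax.mail.URLName;
import org.apache.tika.parser.ParseContext;

public class Connect2ServerExample {
    public static void main(String[] args) throws Exception {
        // credentials embedded in the URLName; connect2Server falls back to these when the
        // ImapCrawlerContext carries no explicit user name / password
        URLName url = new URLName("imap", "mail.example.org", 143, "INBOX", "user", "secret");

        Store store = connect2Server(url, new ParseContext());
        try {
            Folder inbox = store.getFolder(url.getFile());
            inbox.open(Folder.READ_ONLY);
            System.out.println("messages in INBOX: " + inbox.getMessageCount());
            inbox.close(false);
        } finally {
            store.close();
        }
    }
}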
/**
 * Adds first metadata and metadata relevant for incremental indexing to the given metadata object.
 *
 * @param url2getMetadata the url for which metadata should be extracted
 * @param metadata2fill the metadata object. The method will put several entries, such as
 *     Metadata.SOURCE, Metadata.RESOURCE_NAME_KEY, Metadata.CONTENT_ENCODING,
 *     Metadata.CONTENT_TYPE, Metadata.CONTENT_LOCATION and, last but not least, the {@link
 *     IncrementalCrawlingHistory#dataEntityExistsID} and {@link
 *     IncrementalCrawlingHistory#dataEntityContentFingerprint} to determine whether the content
 *     behind the url was modified since the last crawl or not. The URL path entry for
 *     Metadata.SOURCE is the last URL behind potential previous redirects (in case it is an
 *     http connection). The origin URL will be written into an attribute "originalsource" in
 *     case it differs from the one in Metadata.SOURCE. To determine whether an url was modified
 *     or not, the method needs a configured crawling history.
 * @param parseContext the parsing context to specify a crawling history. Can be null; in this
 *     case no history will be used (of course ;) )
 * @return the metadata object, enriched with new metadata (in case this metadata was not set yet)
 */
@Override
public Metadata addFirstMetadata(
        URLName url2getMetadata, Metadata metadata2fill, ParseContext parseContext)
        throws Exception {

    if (metadata2fill == null) metadata2fill = new Metadata();

    // if the object is already filled, we do nothing at all
    if (!(metadata2fill.get(Metadata.SOURCE) == null
            || metadata2fill.get(Metadata.RESOURCE_NAME_KEY) == null
            || metadata2fill.get(Metadata.CONTENT_ENCODING) == null
            || metadata2fill.get(Metadata.CONTENT_TYPE) == null
            || metadata2fill.get(Metadata.CONTENT_LOCATION) == null
            || metadata2fill.get(IncrementalCrawlingHistory.dataEntityContentFingerprint) == null
            || metadata2fill.get(IncrementalCrawlingHistory.dataEntityExistsID) == null)) {
        // everything is set already
        return metadata2fill;
    }

    IncrementalCrawlingHistory crawlingHistory = null;

    if (parseContext == null) parseContext = new ParseContext();

    CrawlerContext crawlerContext = parseContext.get(CrawlerContext.class, new CrawlerContext());

    crawlingHistory = crawlerContext.getIncrementalCrawlingHistory();

    // Should we also close the Lucene parts again here? This is, after all, a utility method.
    // Well - we still have a shutdown hook, and after the crawl everything gets closed anyway.
    // Sounds safe.
    if (crawlingHistory != null) crawlingHistory.openLuceneStuff();

    // keep a backup of the originally passed url
    String strOriginalUrlString = url2getMetadata.toString();
    metadata2fill.set(Metadata.SOURCE, strOriginalUrlString);

    URLConnection connection = null;
    int nrRedirections = 0;
    String strCurrentUrl = url2getMetadata.toString();

    // We're going to loop, accessing urls until we arrive at a url that is not redirected. The
    // redirection is followed manually rather than automatically, which is HttpURLConnection's
    // default behaviour, so that we know the actual url we arrive at.
    while (true) {
        // check if we haven't been redirected too often
        if (nrRedirections > MAX_REDIRECTIONS) {
            throw new IOException(
                    "too many redirections, max = "
                            + MAX_REDIRECTIONS
                            + ", url = "
                            + strOriginalUrlString);
        }

        // normalize the URL
        URL currentUrl = new URL(strCurrentUrl);
        currentUrl = new URL(UrlUtil.normalizeURL(new URLName(currentUrl)).toString());
        strCurrentUrl = currentUrl.toExternalForm();

        // see if a date was registered for this url
        Date ifModifiedSinceDate = null;
        if (crawlingHistory != null) {
            String lastIfModifiedSinceDate =
                    crawlingHistory.getDataEntityContentFingerprint(strCurrentUrl);
            if (lastIfModifiedSinceDate != null)
                ifModifiedSinceDate = new Date(Long.valueOf(lastIfModifiedSinceDate));
        }

        try {
            // there may be connection types other than http - in that case we want to fall back
            // to standard Tika behaviour
            connection = currentUrl.openConnection();
            if (!(connection instanceof HttpURLConnection)) break;

            connection.setConnectTimeout(connectTimeout);
            connection.setReadTimeout(readTimeout);
            connection.setRequestProperty("Accept-Encoding", "gzip");
            ((HttpURLConnection) connection).setInstanceFollowRedirects(false);

            if (ifModifiedSinceDate != null) {
                connection.setIfModifiedSince(ifModifiedSinceDate.getTime());
            }

            // send the request to the server
            connection.connect();
        } catch (Exception e) {
            // I've seen IllegalArgumentExceptions in the sun.net classes here because of some
            // freaky URLs that did not generate MalformedUrlExceptions, so we catch "Exception"
            // to be sure
            if (e instanceof IOException) {
                throw (IOException) e;
            } else {
                throw new LeechException(
                        "connection to " + strOriginalUrlString + " resulted in an exception", e);
            }
        }

        // check for http-specific response codes
        int responseCode = ((HttpURLConnection) connection).getResponseCode();

        if (isRedirected(responseCode)) {
            // follow the redirected url
            String lastUrl = strCurrentUrl;
            strCurrentUrl = getRedirectedUrl(currentUrl, connection);
            nrRedirections++;

            // check for urls that redirect to themselves
            if (strCurrentUrl.equals(lastUrl)) {
                throw new LeechException("url redirects to itself: " + strCurrentUrl);
            }
        } else if (responseCode == HttpURLConnection.HTTP_NOT_FOUND) {
            throw new LeechException(strCurrentUrl + " not found");
        } else if (responseCode == HttpURLConnection.HTTP_NOT_MODIFIED) {
            // the entity has not been modified since the last crawl - we return the
            // ('modification') time of the last crawl so that it is recognized as unmodified
            if (crawlingHistory != null && ifModifiedSinceDate != null)
                metadata2fill.set(
                        IncrementalCrawlingHistory.dataEntityContentFingerprint,
                        String.valueOf(ifModifiedSinceDate.getTime()));

            break;
        } else if (responseCode != HttpURLConnection.HTTP_OK) {
            // this is a communication error, quit with an exception
            throw new IOException(
                    "Http connection error, response code = "
                            + responseCode
                            + ", url = "
                            + currentUrl);
        } else {
            // we're done
            break;
        }
    }

    if (metadata2fill.get(IncrementalCrawlingHistory.dataEntityContentFingerprint) == null)
        metadata2fill.set(
                IncrementalCrawlingHistory.dataEntityContentFingerprint,
                String.valueOf(System.currentTimeMillis()));

    // the entries that Tika itself would put into the metadata, plus a little extra
    metadata2fill.set(Metadata.RESOURCE_NAME_KEY, strCurrentUrl);
    metadata2fill.set(Metadata.SOURCE, strCurrentUrl);
    metadata2fill.set(IncrementalCrawlingHistory.dataEntityExistsID, strCurrentUrl);

    if (strOriginalUrlString.indexOf(strCurrentUrl) == -1)
        metadata2fill.set("originalsource", strOriginalUrlString);

    String type = connection.getContentType();
    // text/xml is far too general to select the right parser
    if (type != null && !type.contains("text/xml")) metadata2fill.set(Metadata.CONTENT_TYPE, type);

    String encoding = connection.getContentEncoding();
    if (encoding != null) metadata2fill.set(Metadata.CONTENT_ENCODING, encoding);

    int length = connection.getContentLength();
    if (length >= 0) metadata2fill.set(Metadata.CONTENT_LENGTH, Integer.toString(length));

    // we still need this to resolve relative links
    metadata2fill.set(Metadata.CONTENT_LOCATION, strCurrentUrl);

    return metadata2fill;
}
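The redirect loop above relies on two helpers that are not shown in this snippet, isRedirected(int) and getRedirectedUrl(URL, URLConnection). A minimal sketch of what they might look like, assuming the check covers the common 3xx status codes and the target is taken from the Location header and resolved against the URL just requested; the actual implementations in the crawler may differ:

import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;

// hypothetical stand-ins for the helpers used in the redirect loop above
final class RedirectHelpers {

    static boolean isRedirected(int responseCode) {
        // 301, 302, 303, 307 and 308 all mean: follow the Location header
        return responseCode == HttpURLConnection.HTTP_MOVED_PERM
                || responseCode == HttpURLConnection.HTTP_MOVED_TEMP
                || responseCode == HttpURLConnection.HTTP_SEE_OTHER
                || responseCode == 307
                || responseCode == 308;
    }

    static String getRedirectedUrl(URL currentUrl, URLConnection connection) throws IOException {
        String location = connection.getHeaderField("Location");
        if (location == null)
            throw new IOException("redirect response without Location header: " + currentUrl);

        // resolve relative redirect targets against the url that was just requested
        return new URL(currentUrl, location).toExternalForm();
    }
}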
public Transport getTransport(URLName url) throws NoSuchProviderException {
    String protocol = url.getProtocol();
    Provider provider = this.getProvider(protocol);
    return this.getTransport(provider, url);
}
public Store getStore(URLName url) throws NoSuchProviderException {
    String protocol = url.getProtocol();
    Provider provider = this.getProvider(protocol);
    return this.getStore(provider, url);
}
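Both lookups above simply dispatch on the protocol part of the URLName to select a provider. A minimal usage sketch of Session.getStore(URLName), assuming an IMAP provider is on the classpath; host, user and password are placeholders:

import java.util.Properties;
import javax.mail.Session;
import javax.mail.Store;
import javax.mail.URLName;

public class GetStoreExample {
    public static void main(String[] args) throws Exception {
        Session session = Session.getInstance(new Properties());

        // the protocol part ("imap") selects the provider, the rest describes the account
        URLName url = new URLName("imap://user@mail.example.org/INBOX");
        Store store = session.getStore(url);

        store.connect(url.getHost(), url.getUsername(), "secret");
        System.out.println("connected: " + store.isConnected());
        store.close();
    }
}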
@Override
protected Iterator<MultiValueHashMap<String, Object>> getSubDataEntitiesInformation(
        InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws Exception {

    // imap url schema: imap[s]://uname@hostname:port/folder;uidvalidity=385759045/;uid=20.
    // Examples (incl. message referencing):
    // http://xml.resource.org/public/rfc/html/rfc2192.html#anchor10
    // However, the Java ImapStore also accepts URLs with passwords, so
    // imap[s]://uname:pwd@hostname:port/folder;uidvalidity=385759045/;uid=20 works as well.

    CrawlerContext crawlerContext = context.get(CrawlerContext.class, new CrawlerContext());

    String strContainerURL = metadata.get(Metadata.SOURCE);
    URLName containerURLName = new URLName(strContainerURL);

    if (m_mailStore == null) m_mailStore = connect2Server(containerURLName, context);

    // if no directory is given, we simply crawl the default folder and the inbox
    LinkedList<Folder> llFolderz2Crawl = new LinkedList<Folder>();
    if (containerURLName.getFile() != null) {
        Folder folder = m_mailStore.getFolder(containerURLName.getFile());
        if (folder != null && folder.exists()) llFolderz2Crawl.add(folder);
        else
            throw new FileNotFoundException(
                    "Can't find imap folder '" + containerURLName.getFile() + "'");
    } else {
        Folder folder = m_mailStore.getDefaultFolder();
        if (folder != null && folder.exists()) llFolderz2Crawl.add(folder);

        folder = m_mailStore.getFolder("INBOX");
        if (folder != null && folder.exists()) llFolderz2Crawl.add(folder);
    }

    LinkedList<MultiValueHashMap<String, Object>> llEntityInfo =
            new LinkedList<MultiValueHashMap<String, Object>>();

    for (Folder folder2crawl : llFolderz2Crawl) {
        // now that we have the container objects, we return the data for the sub-entities

        // the subfolders
        boolean bFolderCanHaveSubFolders =
                (folder2crawl.getType() & Folder.HOLDS_FOLDERS) == Folder.HOLDS_FOLDERS;

        if (bFolderCanHaveSubFolders) {
            folder2crawl.open(Folder.READ_ONLY);

            Folder[] subFolders = folder2crawl.list();
            for (Folder subFolder : subFolders) {
                URLName urlName = subFolder.getURLName();
                URLName urlNameWithPassword =
                        new URLName(
                                containerURLName.getProtocol(),
                                urlName.getHost(),
                                urlName.getPort(),
                                urlName.getFile(),
                                urlName.getUsername(),
                                containerURLName.getPassword());

                if (!checkIfInConstraints(urlName.toString(), null, context)) continue;

                MultiValueHashMap<String, Object> hsEntityInformation =
                        new MultiValueHashMap<String, Object>();

                hsEntityInformation.add(CrawlerParser.SOURCEID, urlName);
                hsEntityInformation.add("urlNameWithPassword", urlNameWithPassword);
                hsEntityInformation.add("folder", subFolder.getFullName());

                llEntityInfo.add(hsEntityInformation);
            }
        }

        // the messages
        boolean bFolderCanHaveMessages =
                (folder2crawl.getType() & Folder.HOLDS_MESSAGES) == Folder.HOLDS_MESSAGES;

        if (bFolderCanHaveMessages) {
            if (!folder2crawl.isOpen()) folder2crawl.open(Folder.READ_ONLY);

            // we fetch all non-deleted messages and additionally drop the ones that are
            // already expunged
            Message[] relevantMessagesOfFolder =
                    folder2crawl.search(new FlagTerm(new Flags(Flags.Flag.DELETED), false));
            ArrayList<Message> nonDelNonExpungedMessages = new ArrayList<Message>();
            for (Message message : relevantMessagesOfFolder)
                if (!message.isExpunged()) nonDelNonExpungedMessages.add(message);
            relevantMessagesOfFolder = nonDelNonExpungedMessages.toArray(new Message[0]);

            // The data we will need later is prefetched efficiently in one go - that is why we
            // don't need a thread with the OneAfterOneIterator to save memory (see
            // DirectoryCrawlerParser). We have the array here anyway. It's one or the other.
            FetchProfile profile = new FetchProfile();
            profile.add(UIDFolder.FetchProfileItem.UID);
            profile.add("Message-ID");
            folder2crawl.fetch(relevantMessagesOfFolder, profile);

            for (int i = 0;
                    i < relevantMessagesOfFolder.length && !crawlerContext.stopRequested();
                    i++) {
                MimeMessage message = (MimeMessage) relevantMessagesOfFolder[i];

                // here we still need one URL with and one without the password
                URLName urlName = getMessageUrl(folder2crawl, message);
                URLName urlNameWithPassword =
                        new URLName(
                                containerURLName.getProtocol(),
                                urlName.getHost(),
                                urlName.getPort(),
                                urlName.getFile(),
                                urlName.getUsername(),
                                containerURLName.getPassword());

                if (!checkIfInConstraints(urlName.toString(), message, context)) continue;

                MultiValueHashMap<String, Object> hsEntityInformation =
                        new MultiValueHashMap<String, Object>();

                hsEntityInformation.add(CrawlerParser.SOURCEID, urlName);
                hsEntityInformation.add("urlNameWithPassword", urlNameWithPassword);
                hsEntityInformation.add("Message-ID", message.getHeader("Message-ID")[0]);
                hsEntityInformation.add("folder", folder2crawl.getFullName());

                llEntityInfo.add(hsEntityInformation);
            }
        }

        // we are finished with this folder, so we can free this memory again
        m_hsImapFolder2Stickyness.clear();

        if (folder2crawl.isOpen()) folder2crawl.close(false);
    }

    return llEntityInfo.iterator();
}
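Putting the pieces together, a crawl of an IMAP account could be started through Leech itself. A minimal sketch, assuming Leech offers a parse(...) overload that accepts the source as a URL string in the form described in the comment at the top of getSubDataEntitiesInformation; host, credentials and the choice of ContentHandler are placeholders:

import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;

public class ImapCrawlExample {
    public static void main(String[] args) throws Exception {
        Leech leech = new Leech();

        // URL with embedded credentials, as accepted by the Java ImapStore; values are placeholders
        String imapUrl = "imaps://user:secret@mail.example.org:993/INBOX";

        // collects the extracted plain text of the crawled messages (-1: no write limit)
        ContentHandler handler = new BodyContentHandler(-1);

        // hypothetical overload: a parse(...) variant that accepts the source as a URL string
        leech.parse(imapUrl, handler, new ParseContext());

        System.out.println(handler.toString());
    }
}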