Example #1
  @Override
  protected void processSubDataEntity(
      MultiValueHashMap<String, Object> subDataEntityInformation,
      Metadata metadata,
      ContentHandler handler2use4recursiveCall,
      ParseContext context)
      throws Exception {

    URLName urlNameWithPassword =
        (URLName) subDataEntityInformation.getFirst("urlNameWithPassword");

    String strMessageId = (String) subDataEntityInformation.getFirst("Message-ID");
    String strMessageFolder = (String) subDataEntityInformation.getFirst("folder");

    String strEntityId = ImapURLStreamProvider.getEntityId(strMessageFolder, strMessageId);

    // We set these entries here up front - the data was already loaded efficiently in a
    // prefetching step. If they are already present in the Metadata object, addFirstMetadata
    // will not load them again.
    metadata.set(Metadata.SOURCE, urlNameWithPassword.toString());
    metadata.set(IncrementalCrawlingHistory.dataEntityId, strEntityId);
    metadata.set(
        IncrementalCrawlingHistory.dataEntityContentFingerprint,
        ImapURLStreamProvider.getDataEntityContentFingerprint(strEntityId));
    URLName urlNameWithoutPassword =
        new URLName(
            urlNameWithPassword.getProtocol(),
            urlNameWithPassword.getHost(),
            urlNameWithPassword.getPort(),
            urlNameWithPassword.getFile(),
            urlNameWithPassword.getUsername(),
            "");
    metadata.set(Metadata.RESOURCE_NAME_KEY, urlNameWithoutPassword.toString());
    if (strMessageId == null)
      metadata.set("Content-Type", DatasourceMediaTypes.IMAPFOLDER.toString());
    else metadata.set("Content-Type", "message/rfc822");

    metadata =
        URLStreamProvider.getURLStreamProvider4Protocol(urlNameWithPassword.getProtocol())
            .addFirstMetadata(urlNameWithPassword, metadata, context);
    InputStream stream =
        URLStreamProvider.getURLStreamProvider(urlNameWithPassword)
            .getStream(urlNameWithPassword, metadata, context);

    try {

      if (m_leech == null) m_leech = new Leech();

      // for a message, this will hopefully pick the Tika RFC822Parser
      Parser parser = m_leech.getParser();

      parser.parse(stream, handler2use4recursiveCall, metadata, context);

    } finally {
      if (stream != null) stream.close();
    }
  }
Example #2
  @Override
  public TikaInputStream getStream(
      URLName url2getStream, Metadata metadata, ParseContext parseContext) throws Exception {
    final URL asUrl = new URL(url2getStream.toString());

    return TikaInputStream.get(
        new ShiftInitInputStream() {
          @Override
          protected InputStream initBeforeFirstStreamDataAccess() throws Exception {
            URLConnection connection = asUrl.openConnection();

            connection.setConnectTimeout(connectTimeout);
            connection.setReadTimeout(readTimeout);
            connection.setRequestProperty("Accept-Encoding", "gzip");

            InputStream ourStream = connection.getInputStream();

            String strContentEncoding = connection.getHeaderField("Content-Encoding");
            if (strContentEncoding != null)
              strContentEncoding = strContentEncoding.toLowerCase().trim();

            if ("gzip".equals(strContentEncoding))
              ourStream = new BufferedInputStream(new GZIPInputStream(ourStream));
            else ourStream = new BufferedInputStream(ourStream);

            return ourStream;
          }
        });
  }
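The stream returned above decompresses gzip transparently when the server honors the Accept-Encoding header. The same idea in isolation, as a minimal standalone sketch (openDecodedStream and the timeout values are hypothetical, not part of the code above):

  import java.io.BufferedInputStream;
  import java.io.InputStream;
  import java.net.URL;
  import java.net.URLConnection;
  import java.util.zip.GZIPInputStream;

  static InputStream openDecodedStream(URL url) throws Exception {
    URLConnection connection = url.openConnection();
    connection.setConnectTimeout(30_000); // assumed timeouts; the code above uses configured fields
    connection.setReadTimeout(30_000);
    connection.setRequestProperty("Accept-Encoding", "gzip"); // ask the server for gzip
    InputStream in = connection.getInputStream();
    String encoding = connection.getHeaderField("Content-Encoding");
    if (encoding != null && "gzip".equals(encoding.trim().toLowerCase()))
      return new BufferedInputStream(new GZIPInputStream(in)); // decompress transparently
    return new BufferedInputStream(in);
  }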
Example #3
 public void testGetFolder() {
   try {
     Folder f = store.getFolder(url.getFile());
     assertNotNull(f);
   } catch (MessagingException e) {
     fail(e.getMessage());
   }
 }
Example #4
  /** We override toString() so we can display the store URLName without the password. */
  public String toString() {
    if (display == null) {
      URLName url = store.getURLName();
      if (url == null) {
        display = store.toString();
      } else {
        // don't show the password
        URLName too =
            new URLName(
                url.getProtocol(),
                url.getHost(),
                url.getPort(),
                url.getFile(),
                url.getUsername(),
                null);
        display = too.toString();
      }
    }

    return display;
  }
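The same strip-the-password construction recurs in Examples #1 and #9. Factored out as a minimal hypothetical helper (withoutPassword is not part of the original code; it only assumes javax.mail's URLName):

  import javax.mail.URLName;

  /** Hypothetical helper: a copy of the given URLName with the password removed. */
  static URLName withoutPassword(URLName url) {
    return new URLName(
        url.getProtocol(), url.getHost(), url.getPort(),
        url.getFile(), url.getUsername(), null); // a null password is omitted by toString()
  }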
Example #5
  public static Store connect2Server(URLName url, ParseContext context) throws MessagingException {

    ImapCrawlerContext imapCrawlerContext =
        context.get(ImapCrawlerContext.class, new ImapCrawlerContext());

    Properties properties = System.getProperties();

    properties.setProperty("mail.store.protocol", url.getProtocol());

    if (imapCrawlerContext.getIgnoreSSLCertificates()) {
      properties.setProperty(
          "mail.imaps.socketFactory.class", CertificateIgnoringSocketFactory.class.getName());
      properties.setProperty("mail.imaps.socketFactory.fallback", "false");
    }

    if (!StringUtils.nullOrWhitespace(imapCrawlerContext.getSSLCertificateFilePath())
        && "imaps".equalsIgnoreCase(url.getProtocol())) {
      properties.setProperty(
          "javax.net.ssl.trustStore", imapCrawlerContext.getSSLCertificateFilePath());
      properties.setProperty(
          "javax.net.ssl.trustStorePassword", imapCrawlerContext.getSSLCertificateFilePassword());
    }

    Session session = Session.getDefaultInstance(properties);
    Store mailStore = session.getStore(url.getProtocol());

    String strUserName = imapCrawlerContext.getUserName();
    if (strUserName == null) strUserName = url.getUsername();

    String strPassword = imapCrawlerContext.getPassword();
    if (strPassword == null) strPassword = url.getPassword();

    if (!mailStore.isConnected())
      mailStore.connect(url.getHost(), url.getPort(), strUserName, strPassword);

    return mailStore;
  }
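A hedged usage sketch of connect2Server - host, port, user and password are placeholders, and an empty ParseContext means the ImapCrawlerContext defaults apply:

  URLName url = new URLName("imaps", "mail.example.com", 993, "INBOX", "uname", "secret");
  Store store = connect2Server(url, new ParseContext());
  try {
    Folder inbox = store.getFolder("INBOX");
    inbox.open(Folder.READ_ONLY);
    System.out.println(inbox.getMessageCount() + " messages in INBOX");
  } finally {
    store.close();
  }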
Example #6
  /**
   * Adds first metadata and metadata relevant for incremental indexing to the given metadata
   * object.
   *
   * @param url2getMetadata the url for which metadata should be extracted
   * @param metadata2fill the metadata object. The method will put several entries, such as
   *     Metadata.SOURCE, Metadata.RESOURCE_NAME_KEY, Metadata.CONTENT_ENCODING,
   *     Metadata.CONTENT_TYPE, Metadata.CONTENT_LOCATION and, last but not least, the {@link
   *     IncrementalCrawlingHistory#dataEntityExistsID} and {@link
   *     IncrementalCrawlingHistory#dataEntityContentFingerprint} to determine whether the content
   *     behind the url was modified since the last crawl or not. The URL path entry for
   *     Metadata.SOURCE is the last URL behind potential previous redirects (in case it is an
   *     http connection). The original URL will be written into an attribute "originalsource" in
   *     case it differs from the one in Metadata.SOURCE. To determine whether a url was modified
   *     or not, the method needs a configured crawling history.
   * @param parseContext the parsing context to specify a crawling history. Can be null; in this
   *     case no history will be used (of course ;) )
   * @return the metadata object, enriched with new metadata (in case this metadata was not set
   *     yet)
   */
  @Override
  public Metadata addFirstMetadata(
      URLName url2getMetadata, Metadata metadata2fill, ParseContext parseContext) throws Exception {

    if (metadata2fill == null) metadata2fill = new Metadata();

    // if the object is already filled, we do nothing at all
    if (!(metadata2fill.get(Metadata.SOURCE) == null
        || metadata2fill.get(Metadata.RESOURCE_NAME_KEY) == null
        || metadata2fill.get(Metadata.CONTENT_ENCODING) == null
        || metadata2fill.get(Metadata.CONTENT_TYPE) == null
        || metadata2fill.get(Metadata.CONTENT_LOCATION) == null
        || metadata2fill.get(IncrementalCrawlingHistory.dataEntityContentFingerprint) == null
        || metadata2fill.get(IncrementalCrawlingHistory.dataEntityExistsID) == null)) {
      // all entries are already set
      return metadata2fill;
    }

    IncrementalCrawlingHistory crawlingHistory = null;

    if (parseContext == null) parseContext = new ParseContext();

    CrawlerContext crawlerContext = parseContext.get(CrawlerContext.class, new CrawlerContext());

    crawlingHistory = crawlerContext.getIncrementalCrawlingHistory();

    // do we perhaps need to close the Lucene parts here again? This is a utility method after
    // all^^ oh well - we also have a shutdown hook, and everything gets closed after the crawl
    // anyway. Sounds safe.
    if (crawlingHistory != null) crawlingHistory.openLuceneStuff();

    // keep a backup of the originally passed url
    String strOriginalUrlString = url2getMetadata.toString();
    metadata2fill.set(Metadata.SOURCE, strOriginalUrlString);

    URLConnection connection = null;
    int nrRedirections = 0;

    String strCurrentUrl = url2getMetadata.toString();
    // We're going to loop, accessing urls until we arrive at a url that is not redirected. The
    // redirection is followed manually rather than automatically, which is HttpURLConnection's
    // default behaviour, so that we know the actual url we arrive at.
    while (true) {
      // check if we haven't been redirected too often
      if (nrRedirections > MAX_REDIRECTIONS) {
        throw new IOException(
            "too many redirections, max = " + MAX_REDIRECTIONS + ", url = " + strOriginalUrlString);
      }

      // normalize the URL
      URL currentUrl = new URL(strCurrentUrl);
      currentUrl = new URL(UrlUtil.normalizeURL(new URLName(currentUrl)).toString());
      strCurrentUrl = currentUrl.toExternalForm();

      // see if a date was registered for this url
      Date ifModifiedSinceDate = null;
      if (crawlingHistory != null) {
        String lastIfModifiedSinceDate =
            crawlingHistory.getDataEntityContentFingerprint(strCurrentUrl);
        if (lastIfModifiedSinceDate != null)
          ifModifiedSinceDate = new Date(Long.valueOf(lastIfModifiedSinceDate));
      }

      try {
        // connections other than http may exist - in that case we want to fall back to the
        // standard Tika behaviour
        connection = currentUrl.openConnection();

        if (!(connection instanceof HttpURLConnection)) break;

        connection.setConnectTimeout(connectTimeout);
        connection.setReadTimeout(readTimeout);
        connection.setRequestProperty("Accept-Encoding", "gzip");
        ((HttpURLConnection) connection).setInstanceFollowRedirects(false);
        if (ifModifiedSinceDate != null) {
          connection.setIfModifiedSince(ifModifiedSinceDate.getTime());
        }

        // send the request to the server
        connection.connect();
      } catch (Exception e) {
        // I've seen IllegalArgumentExceptions from the sun.net classes here, caused by some
        // freaky URLs that did not generate MalformedURLExceptions - so we catch Exception to
        // be sure
        if (e instanceof IOException) {
          throw (IOException) e;
        } else {
          throw new LeechException(
              "connection to " + strOriginalUrlString + " resulted in an exception", e);
        }
      }

      // check for http-specific response codes
      int responseCode = ((HttpURLConnection) connection).getResponseCode();

      if (isRedirected(responseCode)) {
        // follow the redirected url
        String lastUrl = strCurrentUrl;
        strCurrentUrl = getRedirectedUrl(currentUrl, connection);
        nrRedirections++;

        // check for urls that redirect to themselves
        if (strCurrentUrl.equals(lastUrl)) {
          throw new LeechException("url redirects to itself: " + strCurrentUrl);
        }
      } else if (responseCode == HttpURLConnection.HTTP_NOT_FOUND) {
        throw new LeechException(strCurrentUrl + " not found");
      } else if (responseCode == HttpURLConnection.HTTP_NOT_MODIFIED) {
        // not modified since the last crawl - we return the ('modification') time of the last
        // crawl so that the entity is recognized as unmodified.
        if (crawlingHistory != null && ifModifiedSinceDate != null)
          metadata2fill.set(
              IncrementalCrawlingHistory.dataEntityContentFingerprint,
              String.valueOf(ifModifiedSinceDate.getTime()));

        break;
      } else if (responseCode != HttpURLConnection.HTTP_OK) {
        // this is a communication error, quit with an exception
        throw new IOException(
            "Http connection error, response code = " + responseCode + ", url = " + currentUrl);
      } else {
        // we're done
        break;
      }
    }

    if (metadata2fill.get(IncrementalCrawlingHistory.dataEntityContentFingerprint) == null)
      metadata2fill.set(
          IncrementalCrawlingHistory.dataEntityContentFingerprint,
          String.valueOf(System.currentTimeMillis()));

    // the entries that Tika also writes into the metadata, plus a little extra

    metadata2fill.set(Metadata.RESOURCE_NAME_KEY, strCurrentUrl);

    metadata2fill.set(Metadata.SOURCE, strCurrentUrl);
    metadata2fill.set(IncrementalCrawlingHistory.dataEntityExistsID, strCurrentUrl);

    if (strOriginalUrlString.indexOf(strCurrentUrl) == -1)
      metadata2fill.set("originalsource", strOriginalUrlString);

    String type = connection.getContentType();
    // text/xml is far too general to select the right parser
    if (type != null && !type.contains("text/xml")) metadata2fill.set(Metadata.CONTENT_TYPE, type);

    String encoding = connection.getContentEncoding();
    if (encoding != null) metadata2fill.set(Metadata.CONTENT_ENCODING, encoding);

    int length = connection.getContentLength();
    if (length >= 0) metadata2fill.set(Metadata.CONTENT_LENGTH, Integer.toString(length));

    // we still need this to resolve relative links
    metadata2fill.set(Metadata.CONTENT_LOCATION, strCurrentUrl);

    return metadata2fill;
  }
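A hedged usage sketch of addFirstMetadata, using the provider lookup shown in Example #1; the URL is a placeholder, and passing null as parseContext means no crawling history is consulted (as the javadoc allows):

  URLName url = new URLName("http://www.example.com/index.html");
  Metadata metadata =
      URLStreamProvider.getURLStreamProvider4Protocol(url.getProtocol())
          .addFirstMetadata(url, new Metadata(), null); // null -> no history is used
  System.out.println(metadata.get(Metadata.SOURCE));       // final URL after redirects
  System.out.println(metadata.get(Metadata.CONTENT_TYPE)); // as reported by the server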
Example #7
 public Transport getTransport(URLName url) throws NoSuchProviderException {
   String protocol = url.getProtocol();
   Provider provider = this.getProvider(protocol);
   return this.getTransport(provider, url);
 }
Example #8
 public Store getStore(URLName url) throws NoSuchProviderException {
   String protocol = url.getProtocol();
   Provider provider = this.getProvider(protocol);
   return this.getStore(provider, url);
 }
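Both overloads simply resolve the provider from the URL's protocol and delegate. A brief usage sketch against the standard javax.mail API (the URL and credentials are placeholders):

  Session session = Session.getInstance(new Properties());
  Store store = session.getStore(new URLName("imaps://uname@mail.example.com:993/INBOX"));
  store.connect("mail.example.com", 993, "uname", "secret");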
Example #9
  @Override
  protected Iterator<MultiValueHashMap<String, Object>> getSubDataEntitiesInformation(
      InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
      throws Exception {

    // imap url schema: imap[s]://uname@hostname:port/folder;uidvalidity=385759045/;uid=20.
    // Examples (incl. message referencing):
    // http://xml.resource.org/public/rfc/html/rfc2192.html#anchor10
    // However, the Java ImapStore also accepts URLs with passwords, so this works as well:
    // imap[s]://uname:pwd@hostname:port/folder;uidvalidity=385759045/;uid=20

    CrawlerContext crawlerContext = context.get(CrawlerContext.class, new CrawlerContext());

    String strContainerURL = metadata.get(Metadata.SOURCE);

    URLName containerURLName = new URLName(strContainerURL);

    if (m_mailStore == null) m_mailStore = connect2Server(containerURLName, context);

    // if no directory is given, we simply crawl the default folder and the inbox
    LinkedList<Folder> llFolderz2Crawl = new LinkedList<Folder>();
    if (containerURLName.getFile() != null) {
      Folder folder = m_mailStore.getFolder(containerURLName.getFile());
      if (folder != null && folder.exists()) llFolderz2Crawl.add(folder);
      else
        throw new FileNotFoundException(
            "Can't find imap folder '" + containerURLName.getFile() + "'");

    } else {
      Folder folder = m_mailStore.getDefaultFolder();
      if (folder != null && folder.exists()) llFolderz2Crawl.add(folder);

      folder = m_mailStore.getFolder("INBOX");
      if (folder != null && folder.exists()) llFolderz2Crawl.add(folder);
    }

    LinkedList<MultiValueHashMap<String, Object>> llEntityInfo =
        new LinkedList<MultiValueHashMap<String, Object>>();

    for (Folder folder2crawl : llFolderz2Crawl) {
      // now we have the container objects - next we return the data for the sub-entities

      // the subfolders
      boolean bFolderCanHaveSubFolders =
          (folder2crawl.getType() & Folder.HOLDS_FOLDERS) == Folder.HOLDS_FOLDERS;

      if (bFolderCanHaveSubFolders) {
        folder2crawl.open(Folder.READ_ONLY);

        Folder[] subFolders = folder2crawl.list();
        for (Folder subFolder : subFolders) {
          URLName urlName = subFolder.getURLName();
          URLName urlNameWithPassword =
              new URLName(
                  containerURLName.getProtocol(),
                  urlName.getHost(),
                  urlName.getPort(),
                  urlName.getFile(),
                  urlName.getUsername(),
                  containerURLName.getPassword());

          if (!checkIfInConstraints(urlName.toString(), null, context)) continue;

          MultiValueHashMap<String, Object> hsEntityInformation =
              new MultiValueHashMap<String, Object>();

          hsEntityInformation.add(CrawlerParser.SOURCEID, urlName);
          hsEntityInformation.add("urlNameWithPassword", urlNameWithPassword);
          hsEntityInformation.add("folder", subFolder.getFullName());

          llEntityInfo.add(hsEntityInformation);
        }
      }

      // the messages
      boolean bFolderCanHaveMessages =
          (folder2crawl.getType() & Folder.HOLDS_MESSAGES) == Folder.HOLDS_MESSAGES;

      if (bFolderCanHaveMessages) {
        if (!folder2crawl.isOpen()) folder2crawl.open(Folder.READ_ONLY);

        // we fetch all non-deleted messages and additionally throw out those that are 'expunged'
        Message[] relevantMessagesOfFolder =
            folder2crawl.search(new FlagTerm(new Flags(Flags.Flag.DELETED), false));
        ArrayList<Message> nonDelNonExpungedMessages = new ArrayList<Message>();
        for (Message message : relevantMessagesOfFolder)
          if (!message.isExpunged()) nonDelNonExpungedMessages.add(message);
        relevantMessagesOfFolder = nonDelNonExpungedMessages.toArray(new Message[0]);

        // the data we need later is fetched efficiently in one go - that is also why we don't
        // need a thread with the OneAfterOneIterator to save memory (see DirectoryCrawlerParser).
        // We have the array here anyway. It's one or the other.
        FetchProfile profile = new FetchProfile();
        profile.add(UIDFolder.FetchProfileItem.UID);
        profile.add("Message-ID");
        folder2crawl.fetch(relevantMessagesOfFolder, profile);

        for (int i = 0;
            i < relevantMessagesOfFolder.length && !crawlerContext.stopRequested();
            i++) {
          MimeMessage message = (MimeMessage) relevantMessagesOfFolder[i];

          // here we need one URL with and one without the password
          URLName urlName = getMessageUrl(folder2crawl, message);
          URLName urlNameWithPassword =
              new URLName(
                  containerURLName.getProtocol(),
                  urlName.getHost(),
                  urlName.getPort(),
                  urlName.getFile(),
                  urlName.getUsername(),
                  containerURLName.getPassword());

          if (!checkIfInConstraints(urlName.toString(), message, context)) continue;

          MultiValueHashMap<String, Object> hsEntityInformation =
              new MultiValueHashMap<String, Object>();

          hsEntityInformation.add(CrawlerParser.SOURCEID, urlName);
          hsEntityInformation.add("urlNameWithPassword", urlNameWithPassword);
          hsEntityInformation.add("Message-ID", message.getHeader("Message-ID")[0]);
          hsEntityInformation.add("folder", folder2crawl.getFullName());

          llEntityInfo.add(hsEntityInformation);
        }
      }

      // we are done with this folder, so we can free this memory again
      m_hsImapFolder2Stickyness.clear();

      if (folder2crawl.isOpen()) folder2crawl.close(false);
    }

    return llEntityInfo.iterator();
  }
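The prefetching trick above - one FetchProfile round trip for the UIDs and Message-ID headers before iterating - in isolation, as a minimal sketch against the standard javax.mail API (store stands for a connected Store, e.g. from Example #5):

  Folder folder = store.getFolder("INBOX");
  folder.open(Folder.READ_ONLY);

  // all messages that are not flagged as deleted
  Message[] messages = folder.search(new FlagTerm(new Flags(Flags.Flag.DELETED), false));

  // prefetch UIDs and Message-ID headers in a single server round trip
  FetchProfile profile = new FetchProfile();
  profile.add(UIDFolder.FetchProfileItem.UID);
  profile.add("Message-ID");
  folder.fetch(messages, profile);

  for (Message message : messages) {
    if (message.isExpunged()) continue; // skip expunged messages, as above
    String[] ids = message.getHeader("Message-ID");
    System.out.println(ids != null ? ids[0] : "<no Message-ID>");
  }
  folder.close(false);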