Esempio n. 1
0
  private boolean startReindexingMetadata(ArchivalUnit au, boolean force) {
    if (metadataMgr == null) {
      errMsg = "Metadata processing is not enabled.";
      return false;
    }

    if (!force) {
      if (!AuUtil.hasCrawled(au)) {
        errMsg = "Au has never crawled. Click again to reindex metadata";
        showForceReindexMetadata = true;
        return false;
      }

      AuState auState = AuUtil.getAuState(au);
      switch (auState.getSubstanceState()) {
        case No:
          errMsg = "Au has no substance. Click again to reindex metadata";
          showForceReindexMetadata = true;
          return false;
        case Unknown:
          errMsg = "Unknown substance for Au. Click again to reindex metadata.";
          showForceReindexMetadata = true;
          return false;
        case Yes:
          // fall through
      }
    }

    // Fully reindex metadata with the highest priority.
    Connection conn = null;
    PreparedStatement insertPendingAuBatchStatement = null;

    try {
      conn = dbMgr.getConnection();
      insertPendingAuBatchStatement = metadataMgr.getPrioritizedInsertPendingAuBatchStatement(conn);

      if (metadataMgr.enableAndAddAuToReindex(
          au, conn, insertPendingAuBatchStatement, false, true)) {
        statusMsg = "Reindexing metadata for " + au.getName();
        return true;
      }
    } catch (DbException dbe) {
      log.error("Cannot reindex metadata for " + au.getName(), dbe);
    } finally {
      DbManager.safeCloseStatement(insertPendingAuBatchStatement);
      DbManager.safeRollbackAndClose(conn);
    }

    if (force) {
      errMsg = "Still cannot reindex metadata for " + au.getName();
    } else {
      errMsg = "Cannot reindex metadata for " + au.getName();
    }
    return false;
  }
  /**
   * Return a TitleConfig for the AU. Returns matching entry from the title db if found, else
   * creates one
   */
  TitleConfig titleConfigFromAu(InactiveAuProxy au) {
    PluginProxy plugin = au.getPlugin();
    String auname = au.getName();
    Configuration auConfig = au.getConfiguration();
    TitleConfig tc = AuUtil.findTitleConfig(auConfig, plugin.getPlugin());
    if (tc != null) {
      return tc;
    }

    tc = new TitleConfig(auname, plugin.getPluginId());
    ArrayList<ConfigParamAssignment> params = new ArrayList();
    for (Iterator iter = auConfig.keyIterator(); iter.hasNext(); ) {
      String key = (String) iter.next();
      if (!ConfigParamDescr.isReservedParam(key)) {
        String val = auConfig.get(key);
        ConfigParamDescr descr = plugin.findAuConfigDescr(key);
        if (descr != null) {
          ConfigParamAssignment cpa = new ConfigParamAssignment(descr, val);
          params.add(cpa);
        } else {
          log.warning("Unknown parameter key: " + key + " in au: " + auname);
        }
      }
    }
    params.trimToSize();
    tc.setParams(params);
    return tc;
  }
Esempio n. 3
0
 protected void writeCu(CachedUrl cu) throws IOException {
   String url = cu.getUrl();
   long contentSize = cu.getContentSize();
   CIProperties props = cu.getProperties();
   long fetchTime = Long.parseLong(props.getProperty(CachedUrl.PROPERTY_FETCH_TIME));
   InputStream contentIn = cu.getUnfilteredInputStream();
   try {
     if (isResponse) {
       String hdrString = getHttpResponseString(cu);
       long size = contentSize + hdrString.length();
       InputStream headerIn = new ReaderInputStream(new StringReader(hdrString));
       InputStream concat = new SequenceInputStream(headerIn, contentIn);
       try {
         aw.write(xlateFilename(url), cu.getContentType(), getHostIp(), fetchTime, size, concat);
       } finally {
         IOUtil.safeClose(concat);
       }
     } else {
       aw.write(
           xlateFilename(url),
           cu.getContentType(),
           getHostIp(),
           fetchTime,
           cu.getContentSize(),
           contentIn);
     }
   } finally {
     AuUtil.safeRelease(cu);
   }
 }
Esempio n. 4
0
 private void checkSubstance(ArchivalUnit au) {
   SubstanceChecker subChecker = new SubstanceChecker(au);
   if (!subChecker.isEnabled()) {
     errMsg = "No substance patterns defined for plugin.";
     return;
   }
   AuState auState = AuUtil.getAuState(au);
   SubstanceChecker.State oldState = auState.getSubstanceState();
   SubstanceChecker.State newState = subChecker.findSubstance();
   String chtxt = (newState == oldState ? "(unchanged)" : "(was " + oldState.toString() + ")");
   switch (newState) {
     case Unknown:
       log.error("Shouldn't happen: SubstanceChecker returned Unknown");
       errMsg = "Error in SubstanceChecker; see log.";
       break;
     case Yes:
       statusMsg = "AU has substance " + chtxt + ": " + au.getName();
       auState.setSubstanceState(SubstanceChecker.State.Yes);
       break;
     case No:
       statusMsg = "AU has no substance " + chtxt + ": " + au.getName();
       auState.setSubstanceState(SubstanceChecker.State.No);
       break;
   }
 }
 private List getSummaryInfo(CrawlerStatus status) {
   List res = new ArrayList();
   StatusTable.SummaryInfo statusSi =
       new StatusTable.SummaryInfo(
           "Status", ColumnDescriptor.TYPE_STRING, status.getCrawlStatusMsg());
   ArchivalUnit au = status.getAu();
   if (au != null) {
     AuState aus = AuUtil.getAuState(au);
     if (status.getCrawlStatus() == Crawler.STATUS_SUCCESSFUL && aus.hasNoSubstance()) {
       statusSi.setValueFootnote(FOOT_NO_SUBSTANCE_CRAWL_STATUS);
     }
   }
   res.add(statusSi);
   String sources = StringUtil.separatedString(status.getSources());
   res.add(new StatusTable.SummaryInfo("Source", ColumnDescriptor.TYPE_STRING, sources));
   String startUrls = StringUtil.separatedString(status.getStartUrls());
   res.add(
       new StatusTable.SummaryInfo("Starting Url(s)", ColumnDescriptor.TYPE_STRING, startUrls));
   return res;
 }
Esempio n. 6
0
 private void writeFiles() {
   PlatformUtil platutil = PlatformUtil.getInstance();
   CuIterator iter = AuUtil.getCuIterator(au);
   int errs = 0;
   CachedUrl curCu = null;
   CachedUrl nextCu = getNextCu(iter);
   while (nextCu != null) {
     curCu = nextCu;
     nextCu = getNextCu(iter);
     if (excludeDirNodes && nextCu != null && isDirOf(curCu, nextCu)) {
       continue;
     }
     CachedUrl[] cuVersions =
         curCu.getCuVersions(maxVersions > 0 ? maxVersions : Integer.MAX_VALUE);
     for (CachedUrl cu : cuVersions) {
       try {
         log.debug2("Exporting " + cu.getUrl());
         writeCu(cu);
       } catch (IOException e) {
         if (platutil.isDiskFullError(e)) {
           recordError("Disk full, can't write export file.");
           isDiskFull = true;
           return;
         }
       } catch (Exception e) {
         // XXX Would like to differentiate between errors opening or
         // reading CU, which shouldn't cause abort, and errors writing
         // to export file, which should.
         recordError("Unable to copy " + cu.getUrl(), e);
         if (errs++ >= maxErrors) {
           recordError("Aborting after " + errs + " errors");
           return;
         }
       }
     }
   }
 }
 /** engqueue a size calculation for the AU */
 public void queueSizeCalc(ArchivalUnit au) {
   queueSizeCalc(AuUtil.getAuRepoNode(au));
 }
Esempio n. 8
0
  /** Explode the archive into its constituent elements */
  public void explode() throws CacheException {
    CachedUrl cachedUrl = null;
    int goodEntries = 0;
    int badEntries = 0;
    int ignoredEntries = 0;
    int entriesBetweenSleep = 0;
    ArchiveReader arcReader = null;

    logger.info(
        (storeArchive ? "Storing" : "Fetching") + " WARC file: " + origUrl + " will explode");
    try {
      if (storeArchive) {
        UrlCacher uc = au.makeUrlCacher(new UrlData(arcStream, arcProps, fetchUrl));
        BitSet bs = new BitSet();
        bs.set(UrlCacher.DONT_CLOSE_INPUT_STREAM_FLAG);
        uc.setFetchFlags(bs);
        uc.storeContent();
        archiveData.resetInputStream();
        arcStream = archiveData.input;
      }
      // Wrap it in an ArchiveReader
      logger.debug3("About to wrap stream");
      arcReader = wrapStream(fetchUrl, arcStream);
      logger.debug3("wrapStream() returns " + (arcReader == null ? "null" : "non-null"));
      // Explode it
      if (arcReader == null) {
        throw new CacheException.ExploderException("no WarcReader for " + origUrl);
      }
      ArchivalUnit au = crawlFacade.getAu();
      Set stemSet = new HashSet();
      logger.debug("Exploding " + fetchUrl);
      // Iterate through the elements in the WARC file, except the first
      Iterator i = arcReader.iterator();
      // Skip first record
      for (i.next(); i.hasNext(); ) {
        // XXX probably not necessary
        helper.pokeWDog();
        if ((++entriesBetweenSleep % sleepAfter) == 0) {
          long pauseTime =
              CurrentConfig.getTimeIntervalParam(PARAM_RETRY_PAUSE, DEFAULT_RETRY_PAUSE);
          Deadline pause = Deadline.in(pauseTime);
          logger.debug3("Sleeping for " + StringUtil.timeIntervalToString(pauseTime));
          while (!pause.expired()) {
            try {
              pause.sleep();
            } catch (InterruptedException ie) {
              // no action
            }
          }
        }
        ArchiveRecord element = (ArchiveRecord) i.next();
        // Each element is a URL to be cached in a suitable AU
        ArchiveRecordHeader elementHeader = element.getHeader();
        String elementUrl = elementHeader.getUrl();
        String elementMimeType = elementHeader.getMimetype();
        long elementLength = elementHeader.getLength();
        logger.debug2("WARC url " + elementUrl + " mime " + elementMimeType);
        if (elementUrl.startsWith("http:")) {
          ArchiveEntry ae =
              new ArchiveEntry(
                  elementUrl,
                  elementLength,
                  0, // XXX need to convert getDate string to long
                  element, // ArchiveRecord extends InputStream
                  this,
                  fetchUrl);
          ae.setHeaderFields(makeCIProperties(elementHeader));
          long bytesStored = elementLength;
          logger.debug3("ArchiveEntry: " + ae.getName() + " bytes " + bytesStored);
          try {
            helper.process(ae);
          } catch (PluginException ex) {
            throw new CacheException.ExploderException("helper.process() threw", ex);
          }
          if (ae.getBaseUrl() != null) {
            if (ae.getRestOfUrl() != null && ae.getHeaderFields() != null) {
              storeEntry(ae);
              handleAddText(ae);
              goodEntries++;
              crawlFacade.getCrawlerStatus().addContentBytesFetched(bytesStored);
            } else {
              ignoredEntries++;
            }
          } else {
            badEntries++;
            logger.debug2("Can't map " + elementUrl + " from " + archiveUrl);
          }
        }
      }
    } catch (IOException ex) {
      throw new CacheException.ExploderException(ex);
    } finally {
      if (arcReader != null)
        try {
          arcReader.close();
          arcReader = null;
        } catch (IOException ex) {
          throw new CacheException.ExploderException(ex);
        }
      if (cachedUrl != null) {
        cachedUrl.release();
      }
      IOUtil.safeClose(arcStream);
    }
    if (badEntries == 0 && goodEntries > 0) {
      // Make it look like a new crawl finished on each AU to which
      // URLs were added.
      for (Iterator it = touchedAus.iterator(); it.hasNext(); ) {
        ArchivalUnit au = (ArchivalUnit) it.next();
        logger.debug3(archiveUrl + " touching " + au.toString());
        AuUtil.getDaemon(au).getNodeManager(au).newContentCrawlFinished();
      }
    } else {
      ArchivalUnit au = crawlFacade.getAu();
      String msg = archiveUrl + ": " + badEntries + "/" + goodEntries + " bad entries";
      throw new CacheException.UnretryableException(msg);
    }
  }