/**
   * Tests that:
   *
   * <ol>
   *   <li>The given domain configuration and job are not null.
   *   <li>The job does not already contain the given domain configuration.
   *   <li>The domain configuration has the same order xml name as the first inserted domain config.
   * </ol>
   *
   * @param job a given Job
   * @param cfg a given DomainConfiguration
   * @return true, if the given DomainConfiguration can be inserted into the given job
   */
  private boolean checkAddDomainConfInvariant(Job job, DomainConfiguration cfg) {
    ArgumentNotValid.checkNotNull(job, "job");
    ArgumentNotValid.checkNotNull(cfg, "cfg");

    // check if domain in DomainConfiguration cfg is not already in this job
    // domainName is used as key in domainConfigurationMap
    if (job.getDomainConfigurationMap().containsKey(cfg.getDomainName())) {
      if (log.isDebugEnabled()) {
        log.debug("Job already has a configuration for Domain '" + cfg.getDomainName() + "'.");
      }
      return false;
    }

    // check if template is same as this job.
    String orderXMLname = job.getOrderXMLName();
    if (!orderXMLname.equals(cfg.getOrderXmlName())) {
      if (log.isDebugEnabled()) {
        log.debug(
            "This Job only accept configurations "
                + "using the harvest template '"
                + orderXMLname
                + "'. This configuration uses the harvest template '"
                + cfg.getOrderXmlName()
                + "'.");
      }
      return false;
    }

    return true;
  }
  @Override
  public boolean postProcess(InputStream input, OutputStream output) {
    ArgumentNotValid.checkNotNull(input, "InputStream input");
    ArgumentNotValid.checkNotNull(output, "OutputStream output");

    // Let the loaded job handle the post processing.
    loadBatchJob();
    return loadedJob.postProcess(input, output);
  }
  /**
   * Retrieves a set with the name of the files with a specific ReplicaStoreState in a specific
   * replica.
   *
   * @param rep The replica where the files belong.
   * @param state The ReplicaStoreState for the files.
   * @return A set with the names of the files with a specific ReplicaStoreState in a specific
   *     replica.
   * @throws ArgumentNotValid If the Replica or the ReplicaStoreState is null.
   */
  @Override
  public Set<String> getAllFileNames(Replica rep, ReplicaStoreState state) throws ArgumentNotValid {
    ArgumentNotValid.checkNotNull(rep, "Replica rep");
    ArgumentNotValid.checkNotNull(state, "ReplicaStoreState state");

    // initialise the set
    Set<String> res = new HashSet<String>();
    // put the collection of filenames into the set.
    res.addAll(database.retrieveFilenamesForReplicaEntries(rep.getId(), state));
    // return the set.
    return res;
  }
  /**
   * Message to signal from a BitarchiveServer to the BitarchiveMonitorServer that the Bit Archive
   * Application identified by BA_ApplicationId has completed its part of the batch job.
   *
   * <p>Holds status information: list of files processed and a list of ARC files (file names) on
   * which the batch job failed.
   *
   * @param to the channel to which this message is to be sent (must be a BAMON channel)
   * @param originatingBatchMsgId the Id field from the original batch message
   * @param status The object containing status info.
   */
  public BatchEndedMessage(ChannelID to, String originatingBatchMsgId, BatchStatus status) {
    super(to, Channels.getError());
    ArgumentNotValid.checkNotNull(to, "to");
    ArgumentNotValid.checkNotNullOrEmpty(originatingBatchMsgId, "String originatingBatchMsgId");
    ArgumentNotValid.checkNotNull(status, "BatchStatus status");

    this.originatingBatchMsgId = originatingBatchMsgId;
    this.baApplicationId = status.getBitArchiveAppId();
    this.rf = status.getResultFile();
    this.noOfFilesProcessed = status.getNoOfFilesProcessed();
    this.filesFailed = status.getFilesFailed();
    this.exceptions = status.getExceptions();
  }
  /**
   * Assign a StoreMessage to a specific filename. If the filename is already associated with a
   * StoreMessage, then this StoreMessage will be overwritten by the new StoreMessage.
   *
   * @param filename The name of the file to have a StoreMessage assigned.
   * @param msg The StoreMessage to be assigned to a file.
   * @throws ArgumentNotValid If the StoreMessage is null or if the filename is either null or the
   *     empty string.
   */
  @Override
  public void setReplyInfo(String filename, StoreMessage msg) throws ArgumentNotValid {
    ArgumentNotValid.checkNotNull(msg, "StoreMessage msg");
    ArgumentNotValid.checkNotNullOrEmpty(filename, "String filename");

    // put into the map, and overwrite any existing mapping.
    storeEntries.put(filename, msg);
  }
 /**
  * Unpacks and calls accept() on the message object.
  *
  * <p>This method catches <b>all</b> exceptions and logs them.
  *
  * @param msg a ObjectMessage
  */
 public void onMessage(Message msg) {
   ArgumentNotValid.checkNotNull(msg, "Message msg");
   log.trace("Message received:\n{}", msg.toString());
   try {
     ((ArchiveMessage) JMSConnection.unpack(msg)).accept(this);
   } catch (ClassCastException e) {
     log.warn("Invalid message type", e);
   } catch (Throwable t) {
     log.warn("Error processing message '{}'", msg, t);
   }
 }
  /**
   * Creates a new NetarkivetMessage.
   *
   * @param to the initial receiver of the message
   * @param replyTo the initial sender of the message
   * @throws ArgumentNotValid if to==replyTo, the replyTo parameter is a topic instead of a queue,
   *     or there is a null parameter.
   */
  protected NetarkivetMessage(ChannelID to, ChannelID replyTo) {
    ArgumentNotValid.checkNotNull(to, "to");
    ArgumentNotValid.checkNotNull(replyTo, "replyTo");

    if (to.getName().equals(replyTo.getName())) {
      throw new ArgumentNotValid("to and replyTo should not be equal.");
    }

    // Have not implemented replying to a topic because there is no use
    // for it in our current architecture
    if (Channels.isTopic(replyTo.getName())) {
      throw new ArgumentNotValid(
          "Reply channel must be queue but " + replyTo.toString() + " is a Topic");
    }

    this.to = to;
    this.replyTo = replyTo;
    this.id = null;
    this.replyOfId = null;
  }
  /**
   * Create a new batch job that runs the loaded class.
   *
   * @param classFile the classfile for the batch job we want to run.
   * @param arguments The arguments for the batchjobs. This can be null.
   * @throws ArgumentNotValid If the classfile is null.
   */
  public LoadableFileBatchJob(File classFile, List<String> arguments) throws ArgumentNotValid {
    ArgumentNotValid.checkNotNull(classFile, "File classFile");
    fileContents = FileUtils.readBinaryFile(classFile);
    fileName = classFile.getName();
    if (arguments == null) {
      this.args = new ArrayList<String>();
    } else {
      this.args = arguments;
    }

    loadBatchJob();
  }
  /**
   * Message to signal from a BitarchiveServer to the BitarchiveMonitorServer that the Bit Archive
   * Application identified by BA_ApplicationId has completed its part of the batch job.
   *
   * <p>Holds status information: list of files processed and a list of ARC files (file names) on
   * which the batch job failed.
   *
   * @param to the channel to which this message is to be sent (must be a BAMON channel)
   * @param baAppId Identifier for the machine sending this message, usually containing the IP
   *     address and http port number
   * @param originatingBatchMsgId the Id field from the original batch message
   * @param rf he remote file reference containing the output of the batch job (may be null if no
   *     output is generated).
   * @throws ArgumentNotValid If the BA_ApplicationId or the originatingBatchMsgId are null or
   *     empty, or if the channel 'to' is null.
   */
  public BatchEndedMessage(
      ChannelID to, String baAppId, String originatingBatchMsgId, RemoteFile rf)
      throws ArgumentNotValid {
    super(to, Channels.getError());
    ArgumentNotValid.checkNotNull(to, "ChannelID to");
    ArgumentNotValid.checkNotNullOrEmpty(baAppId, "String baAppId");
    ArgumentNotValid.checkNotNullOrEmpty(originatingBatchMsgId, "String originatingBatchMsgId");

    this.baApplicationId = baAppId;
    this.originatingBatchMsgId = originatingBatchMsgId;
    this.rf = rf;
  }
  /**
   * Sets the store state of an entry to a specific value.
   *
   * @param filename The name of the file for the entry.
   * @param repChannelId The identification channel of the replica for the entry.
   * @param state The new state for the entry.
   * @throws ArgumentNotValid If the ReplicaStoreState is null, or if either the filename or the
   *     replica identification channel is either null or the empty string.
   */
  @Override
  public void setState(String filename, String repChannelId, ReplicaStoreState state)
      throws ArgumentNotValid {
    ArgumentNotValid.checkNotNull(state, "ReplicaStoreState state");
    ArgumentNotValid.checkNotNullOrEmpty(filename, "String filename");
    ArgumentNotValid.checkNotNullOrEmpty(repChannelId, "String repChannelId");

    // retrieve the replica
    Replica rep = Channels.retrieveReplicaFromIdentifierChannel(repChannelId);

    // update the database.
    database.setReplicaStoreState(filename, rep.getId(), state);
  }
 /**
  * Process a record on crawl log concerning the given domain to result.
  *
  * @param record The record to process.
  * @param os The output stream for the result.
  * @throws ArgumentNotValid on null parameters
  * @throws IOFailure on trouble processing the record.
  */
 @Override
 public void processRecord(ArchiveRecordBase record, OutputStream os) {
   ArgumentNotValid.checkNotNull(record, "ArchiveRecordBase record");
   ArgumentNotValid.checkNotNull(os, "OutputStream os");
   BufferedReader arcreader = new BufferedReader(new InputStreamReader(record.getInputStream()));
   try {
     for (String line = arcreader.readLine(); line != null; line = arcreader.readLine()) {
       if (line.matches(regexp)) {
         os.write(line.getBytes("UTF-8"));
         os.write('\n');
       }
     }
   } catch (IOException e) {
     throw new IOFailure("Unable to process (w)arc record", e);
   } finally {
     try {
       arcreader.close();
     } catch (IOException e) {
       log.warn("unable to close arcreader probably", e);
     }
   }
 }
Пример #12
0
 /**
  * Set the actual time when this job was started.
  *
  * <p>Sends a notification, if actualStart is set to a time after actualStop.
  *
  * @param actualStart A Date object representing the time when this job was started.
  */
 public void setActualStart(Date actualStart) {
   ArgumentNotValid.checkNotNull(actualStart, "actualStart");
   if (actualStop != null && actualStop.before(actualStart)) {
     log.warn(
         "Job("
             + getJobID()
             + "): Start time ("
             + actualStart
             + ") is after end time: "
             + actualStop);
   }
   this.actualStart = (Date) actualStart.clone();
 }
Пример #13
0
 /**
  * Set the actual time when this job was stopped/completed. Sends a notification, if actualStop is
  * set to a time before actualStart.
  *
  * @param actualStop A Date object representing the time when this job was stopped.
  * @throws ArgumentNotValid
  */
 public void setActualStop(Date actualStop) throws ArgumentNotValid {
   ArgumentNotValid.checkNotNull(actualStop, "actualStop");
   if (actualStart == null) {
     log.warn("Job(" + getJobID() + "): actualStart should be defined before setting actualStop");
   } else if (actualStop.before(actualStart)) {
     log.warn(
         "Job("
             + getJobID()
             + "): actualStop ("
             + actualStop
             + ") is before actualStart: "
             + actualStart);
   }
   this.actualStop = (Date) actualStop.clone();
 }
Пример #14
0
  /**
   * Sets status of this job.
   *
   * @param newStatus Must be one of the values STATUS_NEW, ..., STATUS_FAILED
   * @throws ArgumentNotValid in case of invalid status argument or invalid status change
   */
  public void setStatus(JobStatus newStatus) {
    ArgumentNotValid.checkNotNull(newStatus, "newStatus");
    if (!status.legalChange(newStatus)) {
      final String message =
          "Status change from " + status + " to " + newStatus + " is not allowed";
      log.debug(message);
      throw new ArgumentNotValid(message);
    }

    if ((this.status == JobStatus.NEW || this.status == JobStatus.RESUBMITTED)
        && newStatus == JobStatus.SUBMITTED) {
      orderXMLdoc.configureQuotaEnforcer(
          maxObjectsIsSetByQuotaEnforcer, forceMaxBytesPerDomain, forceMaxObjectsPerDomain);
    }

    if (this.status == JobStatus.SUBMITTED && newStatus == JobStatus.STARTED) {
      setActualStart(new Date());
    }
    if (this.status == JobStatus.STARTED
        && (newStatus == JobStatus.DONE || newStatus == JobStatus.FAILED)) {
      setActualStop(new Date());
    }
    status = newStatus;
  }
 /**
  * This method should be overridden and implemented by a sub class if message handling is wanted.
  *
  * @param msg an GetAllFilenamesMessage
  * @throws PermissionDenied when invoked
  */
 public void visit(GetAllFilenamesMessage msg) throws PermissionDenied {
   ArgumentNotValid.checkNotNull(msg, "msg");
   deny(msg);
 }
Пример #16
0
 /**
  * Set the orderxml for this job.
  *
  * @param doc A orderxml to be used by this job
  */
 public void setOrderXMLDoc(HeritrixTemplate doc) {
   ArgumentNotValid.checkNotNull(doc, "doc");
   this.orderXMLdoc = doc;
 }
Пример #17
0
  /**
   * Package private constructor for common initialisation.
   *
   * @param harvestID the id of the harvestdefinition
   * @param cfg the configuration to base the Job on
   * @param orderXMLdoc
   * @param channel the channel on which the job will be submitted.
   * @param forceMaxObjectsPerDomain the maximum number of objects harvested from a domain,
   *     overrides individual configuration settings. -1 means no limit
   * @param forceMaxBytesPerDomain The maximum number of objects harvested from a domain, or -1 for
   *     no limit.
   * @param forceMaxJobRunningTime The max time in seconds given to the harvester for this job
   * @param harvestNum the run number of the harvest definition
   * @throws ArgumentNotValid if cfg or priority is null or harvestID is invalid, or if any limit <
   *     -1
   */
  public Job(
      Long harvestID,
      DomainConfiguration cfg,
      HeritrixTemplate orderXMLdoc,
      HarvestChannel channel,
      long forceMaxObjectsPerDomain,
      long forceMaxBytesPerDomain,
      long forceMaxJobRunningTime,
      int harvestNum)
      throws ArgumentNotValid {
    ArgumentNotValid.checkNotNull(cfg, "cfg");
    ArgumentNotValid.checkNotNull(harvestID, "harvestID");
    ArgumentNotValid.checkNotNegative(harvestID, "harvestID");
    ArgumentNotValid.checkNotNull(channel, "channel");

    if (forceMaxObjectsPerDomain < -1) {
      String msg = "forceMaxObjectsPerDomain must be either -1 or positive";
      log.debug(msg);
      throw new ArgumentNotValid(msg);
    }
    if (forceMaxBytesPerDomain < -1) {
      String msg = "forceMaxBytesPerDomain must be either -1 or positive";
      log.debug(msg);
      throw new ArgumentNotValid(msg);
    }

    if (forceMaxBytesPerDomain == 0L) {
      log.warn(
          "forceMaxBytesPerDomain should probably not be 0.Means 0 bytes downloaded per domain");
    }

    if (forceMaxObjectsPerDomain == 0L) {
      log.warn(
          "forceMaxObjectsPerDomain should probably not be 0.Means 0 objects downloaded per domain");
    }

    // setup initial members
    domainConfigurationMap = new HashMap<>();
    origHarvestDefinitionID = harvestID;
    orderXMLname = cfg.getOrderXmlName();
    this.orderXMLdoc = orderXMLdoc;

    setHarvestChannel(channel);

    long maxObjects = NumberUtils.minInf(forceMaxObjectsPerDomain, cfg.getMaxObjects());
    setMaxObjectsPerDomain(maxObjects);
    configurationSetsObjectLimit = (maxObjects != forceMaxObjectsPerDomain);

    long maxBytes = NumberUtils.minInf(forceMaxBytesPerDomain, cfg.getMaxBytes());
    setMaxBytesPerDomain(maxBytes);
    configurationSetsByteLimit = (maxBytes != forceMaxBytesPerDomain);

    long expectation =
        cfg.getExpectedNumberOfObjects(forceMaxObjectsPerDomain, forceMaxBytesPerDomain);
    maxCountObjects = expectation;
    minCountObjects = expectation;
    this.harvestNum = harvestNum;

    addConfiguration(cfg);

    setMaxJobRunningTime(forceMaxJobRunningTime);

    setArchiveFormatInTemplate(Settings.get(HarvesterSettings.HERITRIX_ARCHIVE_FORMAT));

    setAttributes(cfg.getAttributesAndTypes());

    orderXMLdoc.enableOrDisableDeduplication(
        Settings.getBoolean(HarvesterSettings.DEDUPLICATION_ENABLED));

    status = JobStatus.NEW;
  }
Пример #18
0
  /**
   * Adds a configuration to this Job. Seedlists and settings are updated accordingly.
   *
   * @param cfg the configuration to add
   * @throws ArgumentNotValid if cfg is null or cfg uses a different orderxml than this job or if
   *     this job already contains a configuration associated with domain of configuration cfg.
   */
  public void addConfiguration(DomainConfiguration cfg) {
    ArgumentNotValid.checkNotNull(cfg, "cfg");
    if (domainConfigurationMap.containsKey(cfg.getDomainName())) {
      throw new ArgumentNotValid(
          "Job already has a configuration for Domain " + cfg.getDomainName());
    }

    if (log.isTraceEnabled()) {
      log.trace("Adding configuration '{}' to job '{}'", cfg, cfg.getName());
    }

    if (!underConstruction) {
      final String msg = "Cannot modify job " + this + " as it is no longer under construction";
      log.debug(msg);
      throw new IllegalState(msg);
    }

    if (!cfg.getOrderXmlName().equals(getOrderXMLName())) {
      throw new ArgumentNotValid(
          "Job requires the orderxml file:'"
              + getOrderXMLName()
              + "' not:'"
              + cfg.getOrderXmlName()
              + "' used by the configuration:'"
              + cfg.getName());
    }

    domainConfigurationMap.put(cfg.getDomainName(), cfg.getName());

    // Add the seeds from the configuration to the Job seeds.
    // Take care of duplicates.
    for (Iterator<SeedList> itt = cfg.getSeedLists(); itt.hasNext(); ) {
      SeedList seed = itt.next();
      List<String> seeds = seed.getSeeds();
      for (String seedUrl : seeds) {
        seedListSet.add(seedUrl); // duplicates is silently ignored

        // TODO remove when heritrix implements this functionality
        // try to convert a seed into a Internationalized Domain Name
        try {
          String seedASCII = seedUrl;
          // It is rare to see these seeds, but they need to be
          // correctly idnaized
          if (seedUrl.contains(":") || seedUrl.contains("/")) {
            String normalizedUrl = seedUrl;
            if (!normalizedUrl.matches("^[a-zA-Z]+:.*")) {
              // If no protocol is given, assume http
              normalizedUrl = "http://" + normalizedUrl;
            }
            URL url = new URL(normalizedUrl);
            String domainName = url.getHost();
            String domainNameASCII = IDNA.toASCII(domainName);
            if (!domainName.equals(domainNameASCII)) {
              // If the domain name changed, replace that in the
              // seed.
              seedASCII = seedUrl.replaceFirst(Pattern.quote(domainName), domainNameASCII);
            }
          } else {
            seedASCII = IDNA.toASCII(seedUrl);
          }
          if (!seedASCII.equals(seedUrl)) {
            log.trace("Converted {} to {}", seedUrl, seedASCII);
            // Note that duplicates is silently ignored
            seedListSet.add(seedASCII);
          }
        } catch (IDNAException e) {
          log.trace("Cannot convert seed {} to ASCII", seedUrl, e);
        } catch (MalformedURLException e) {
          log.trace("Cannot convert seed {} to ASCII", seedUrl, e);
        }
      }
    }

    orderXMLdoc.editOrderXMLAddPerDomainCrawlerTraps(cfg);

    // TODO update limits in settings files - see also bug 269

    // Update estimates of job size
    long expectation =
        cfg.getExpectedNumberOfObjects(forceMaxObjectsPerDomain, forceMaxBytesPerDomain);
    maxCountObjects = Math.max(expectation, maxCountObjects);
    minCountObjects = Math.min(expectation, minCountObjects);
    totalCountObjects += expectation;

    configsChanged = true;

    assert (maxCountObjects >= minCountObjects) : "basic invariant";
  }
 /**
  * Initialize the job before runnning. This is called before the processFile() calls.
  *
  * @param os the OutputStream to which output should be written
  */
 public void initialize(OutputStream os) {
   ArgumentNotValid.checkNotNull(os, "OutputStream os");
   loadBatchJob();
   loadedJob.initialize(os);
 }
 /**
  * Process one file stored in the bit archive.
  *
  * @param file the file to be processed.
  * @param os the OutputStream to which output should be written
  * @return true if the file was successfully processed, false otherwise
  */
 public boolean processFile(File file, OutputStream os) {
   log.trace("Started processing of file '" + file.getAbsolutePath() + "'.");
   ArgumentNotValid.checkNotNull(file, "File file");
   ArgumentNotValid.checkNotNull(os, "OutputStream os");
   return loadedJob.processFile(file, os);
 }
 /**
  * Finish up the job. This is called after the last process() call.
  *
  * @param os the OutputStream to which output should be written
  */
 public void finish(OutputStream os) {
   ArgumentNotValid.checkNotNull(os, "OutputStream os");
   loadedJob.finish(os);
 }
 /**
  * This method should be overridden and implemented by a sub class if message handling is wanted.
  *
  * @param msg an CorrectMessage for correcting a record.
  * @throws PermissionDenied when invoked
  */
 public void visit(CorrectMessage msg) throws PermissionDenied {
   ArgumentNotValid.checkNotNull(msg, "msg");
   deny(msg);
 }
 /**
  * This method should be overridden and implemented by a sub class if message handling is wanted.
  *
  * @param msg a RemoveAndGetFile
  * @throws PermissionDenied when invoked
  */
 public void visit(RemoveAndGetFileMessage msg) throws PermissionDenied {
   ArgumentNotValid.checkNotNull(msg, "msg");
   deny(msg);
 }