/**
 * Tests that:
 *
 * <ol>
 * <li>The given domain configuration and job are not null.
 * <li>The job does not already contain the given domain configuration.
 * <li>The domain configuration has the same order XML name as the first inserted domain
 * configuration.
 * </ol>
 *
 * @param job a given Job
 * @param cfg a given DomainConfiguration
 * @return true, if the given DomainConfiguration can be inserted into the given job
 */
private boolean checkAddDomainConfInvariant(Job job, DomainConfiguration cfg) {
    ArgumentNotValid.checkNotNull(job, "job");
    ArgumentNotValid.checkNotNull(cfg, "cfg");
    // Check that the domain of DomainConfiguration cfg is not already part of this job.
    // The domain name is used as key in domainConfigurationMap.
    if (job.getDomainConfigurationMap().containsKey(cfg.getDomainName())) {
        if (log.isDebugEnabled()) {
            log.debug("Job already has a configuration for Domain '" + cfg.getDomainName() + "'.");
        }
        return false;
    }
    // Check that the configuration uses the same harvest template as this job.
    String orderXMLname = job.getOrderXMLName();
    if (!orderXMLname.equals(cfg.getOrderXmlName())) {
        if (log.isDebugEnabled()) {
            log.debug("This Job only accepts configurations using the harvest template '"
                    + orderXMLname + "'. This configuration uses the harvest template '"
                    + cfg.getOrderXmlName() + "'.");
        }
        return false;
    }
    return true;
}
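// Illustrative sketch (not part of the original source): one way a caller might use
// checkAddDomainConfInvariant() as a guard before adding configurations to a job. The method
// name 'addEligibleConfigurations' and the 'candidateConfigs' parameter are hypothetical.
private void addEligibleConfigurations(Job job, List<DomainConfiguration> candidateConfigs) {
    for (DomainConfiguration cfg : candidateConfigs) {
        if (checkAddDomainConfInvariant(job, cfg)) {
            // Safe to add: the job has no configuration for this domain yet, and the
            // configuration uses the same harvest template as the job.
            job.addConfiguration(cfg);
        }
    }
}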
/**
 * Post-processing of the batch job output. Delegates the post-processing to the loaded batch job.
 *
 * @param input the input stream to post-process
 * @param output the output stream to write the post-processed result to
 * @return the result of the loaded job's postProcess() call
 */
@Override
public boolean postProcess(InputStream input, OutputStream output) {
    ArgumentNotValid.checkNotNull(input, "InputStream input");
    ArgumentNotValid.checkNotNull(output, "OutputStream output");
    // Let the loaded job handle the post processing.
    loadBatchJob();
    return loadedJob.postProcess(input, output);
}
/**
 * Retrieves a set with the names of the files with a specific ReplicaStoreState in a specific
 * replica.
 *
 * @param rep The replica where the files belong.
 * @param state The ReplicaStoreState for the files.
 * @return A set with the names of the files with a specific ReplicaStoreState in a specific
 * replica.
 * @throws ArgumentNotValid If the Replica or the ReplicaStoreState is null.
 */
@Override
public Set<String> getAllFileNames(Replica rep, ReplicaStoreState state) throws ArgumentNotValid {
    ArgumentNotValid.checkNotNull(rep, "Replica rep");
    ArgumentNotValid.checkNotNull(state, "ReplicaStoreState state");
    // Initialise the set.
    Set<String> res = new HashSet<String>();
    // Put the collection of filenames into the set.
    res.addAll(database.retrieveFilenamesForReplicaEntries(rep.getId(), state));
    // Return the set.
    return res;
}
/**
 * Message to signal from a BitarchiveServer to the BitarchiveMonitorServer that the Bit Archive
 * Application identified by BA_ApplicationId has completed its part of the batch job.
 *
 * <p>Holds status information: list of files processed and a list of ARC files (file names) on
 * which the batch job failed.
 *
 * @param to the channel to which this message is to be sent (must be a BAMON channel)
 * @param originatingBatchMsgId the Id field from the original batch message
 * @param status The object containing status info.
 * @throws ArgumentNotValid If 'to' or the status is null, or if originatingBatchMsgId is null or
 * empty.
 */
public BatchEndedMessage(ChannelID to, String originatingBatchMsgId, BatchStatus status) {
    super(to, Channels.getError());
    ArgumentNotValid.checkNotNull(to, "to");
    ArgumentNotValid.checkNotNullOrEmpty(originatingBatchMsgId, "String originatingBatchMsgId");
    ArgumentNotValid.checkNotNull(status, "BatchStatus status");
    this.originatingBatchMsgId = originatingBatchMsgId;
    this.baApplicationId = status.getBitArchiveAppId();
    this.rf = status.getResultFile();
    this.noOfFilesProcessed = status.getNoOfFilesProcessed();
    this.filesFailed = status.getFilesFailed();
    this.exceptions = status.getExceptions();
}
/**
 * Assign a StoreMessage to a specific filename. If the filename is already associated with a
 * StoreMessage, then this StoreMessage will be overwritten by the new StoreMessage.
 *
 * @param filename The name of the file to have a StoreMessage assigned.
 * @param msg The StoreMessage to be assigned to a file.
 * @throws ArgumentNotValid If the StoreMessage is null or if the filename is either null or the
 * empty string.
 */
@Override
public void setReplyInfo(String filename, StoreMessage msg) throws ArgumentNotValid {
    ArgumentNotValid.checkNotNull(msg, "StoreMessage msg");
    ArgumentNotValid.checkNotNullOrEmpty(filename, "String filename");
    // Put into the map, and overwrite any existing mapping.
    storeEntries.put(filename, msg);
}
/**
 * Unpacks and calls accept() on the message object.
 *
 * <p>This method catches <b>all</b> exceptions and logs them.
 *
 * @param msg an ObjectMessage
 */
public void onMessage(Message msg) {
    ArgumentNotValid.checkNotNull(msg, "Message msg");
    log.trace("Message received:\n{}", msg.toString());
    try {
        ((ArchiveMessage) JMSConnection.unpack(msg)).accept(this);
    } catch (ClassCastException e) {
        log.warn("Invalid message type", e);
    } catch (Throwable t) {
        log.warn("Error processing message '{}'", msg, t);
    }
}
/**
 * Creates a new NetarkivetMessage.
 *
 * @param to the initial receiver of the message
 * @param replyTo the initial sender of the message
 * @throws ArgumentNotValid if to==replyTo, the replyTo parameter is a topic instead of a queue,
 * or there is a null parameter.
 */
protected NetarkivetMessage(ChannelID to, ChannelID replyTo) {
    ArgumentNotValid.checkNotNull(to, "to");
    ArgumentNotValid.checkNotNull(replyTo, "replyTo");
    if (to.getName().equals(replyTo.getName())) {
        throw new ArgumentNotValid("to and replyTo should not be equal.");
    }
    // Replying to a topic is not implemented because there is no use for it in our current
    // architecture.
    if (Channels.isTopic(replyTo.getName())) {
        throw new ArgumentNotValid(
                "Reply channel must be a queue but " + replyTo.toString() + " is a Topic");
    }
    this.to = to;
    this.replyTo = replyTo;
    this.id = null;
    this.replyOfId = null;
}
/**
 * Create a new batch job that runs the loaded class.
 *
 * @param classFile the class file for the batch job we want to run.
 * @param arguments The arguments for the batch job. This can be null.
 * @throws ArgumentNotValid If the classFile is null.
 */
public LoadableFileBatchJob(File classFile, List<String> arguments) throws ArgumentNotValid {
    ArgumentNotValid.checkNotNull(classFile, "File classFile");
    fileContents = FileUtils.readBinaryFile(classFile);
    fileName = classFile.getName();
    if (arguments == null) {
        this.args = new ArrayList<String>();
    } else {
        this.args = arguments;
    }
    loadBatchJob();
}
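// Illustrative sketch (not part of the original source): FileUtils.readBinaryFile is a project
// helper; reading the class file into memory so the job can be serialized and re-loaded later
// could be done with the JDK alone as below. The method name is hypothetical.
private static byte[] readClassFileBytes(File classFile) {
    try {
        return java.nio.file.Files.readAllBytes(classFile.toPath());
    } catch (IOException e) {
        throw new IOFailure("Could not read class file " + classFile, e);
    }
}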
/**
 * Message to signal from a BitarchiveServer to the BitarchiveMonitorServer that the Bit Archive
 * Application identified by BA_ApplicationId has completed its part of the batch job.
 *
 * <p>Holds status information: list of files processed and a list of ARC files (file names) on
 * which the batch job failed.
 *
 * @param to the channel to which this message is to be sent (must be a BAMON channel)
 * @param baAppId Identifier for the machine sending this message, usually containing the IP
 * address and http port number
 * @param originatingBatchMsgId the Id field from the original batch message
 * @param rf the remote file reference containing the output of the batch job (may be null if no
 * output is generated).
 * @throws ArgumentNotValid If the BA_ApplicationId or the originatingBatchMsgId are null or
 * empty, or if the channel 'to' is null.
 */
public BatchEndedMessage(ChannelID to, String baAppId, String originatingBatchMsgId, RemoteFile rf)
        throws ArgumentNotValid {
    super(to, Channels.getError());
    ArgumentNotValid.checkNotNull(to, "ChannelID to");
    ArgumentNotValid.checkNotNullOrEmpty(baAppId, "String baAppId");
    ArgumentNotValid.checkNotNullOrEmpty(originatingBatchMsgId, "String originatingBatchMsgId");
    this.baApplicationId = baAppId;
    this.originatingBatchMsgId = originatingBatchMsgId;
    this.rf = rf;
}
/**
 * Sets the store state of an entry to a specific value.
 *
 * @param filename The name of the file for the entry.
 * @param repChannelId The identification channel of the replica for the entry.
 * @param state The new state for the entry.
 * @throws ArgumentNotValid If the ReplicaStoreState is null, or if either the filename or the
 * replica identification channel is either null or the empty string.
 */
@Override
public void setState(String filename, String repChannelId, ReplicaStoreState state)
        throws ArgumentNotValid {
    ArgumentNotValid.checkNotNull(state, "ReplicaStoreState state");
    ArgumentNotValid.checkNotNullOrEmpty(filename, "String filename");
    ArgumentNotValid.checkNotNullOrEmpty(repChannelId, "String repChannelId");
    // Retrieve the replica.
    Replica rep = Channels.retrieveReplicaFromIdentifierChannel(repChannelId);
    // Update the database.
    database.setReplicaStoreState(filename, rep.getId(), state);
}
/**
 * Processes a crawl log record concerning the given domain: every line matching the regular
 * expression is written to the result.
 *
 * @param record The record to process.
 * @param os The output stream for the result.
 * @throws ArgumentNotValid on null parameters
 * @throws IOFailure on trouble processing the record.
 */
@Override
public void processRecord(ArchiveRecordBase record, OutputStream os) {
    ArgumentNotValid.checkNotNull(record, "ArchiveRecordBase record");
    ArgumentNotValid.checkNotNull(os, "OutputStream os");
    BufferedReader arcreader =
            new BufferedReader(new InputStreamReader(record.getInputStream()));
    try {
        for (String line = arcreader.readLine(); line != null; line = arcreader.readLine()) {
            if (line.matches(regexp)) {
                os.write(line.getBytes("UTF-8"));
                os.write('\n');
            }
        }
    } catch (IOException e) {
        throw new IOFailure("Unable to process (w)arc record", e);
    } finally {
        try {
            arcreader.close();
        } catch (IOException e) {
            log.warn("Unable to close arcreader properly", e);
        }
    }
}
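// Illustrative sketch (not part of the original source): the same line-filtering logic as
// processRecord() above, applied to an in-memory byte array so it can be exercised without a
// (W)ARC record. The method name and the example regexp are hypothetical.
private static byte[] filterLinesMatching(byte[] content, String regexp) throws IOException {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    BufferedReader reader =
            new BufferedReader(new InputStreamReader(new ByteArrayInputStream(content), "UTF-8"));
    try {
        for (String line = reader.readLine(); line != null; line = reader.readLine()) {
            if (line.matches(regexp)) {
                out.write(line.getBytes("UTF-8"));
                out.write('\n');
            }
        }
    } finally {
        reader.close();
    }
    return out.toByteArray();
}
// Example: filterLinesMatching(crawlLogBytes, ".*\\.example\\.org.*") keeps only the lines
// mentioning that domain.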
/**
 * Set the actual time when this job was started.
 *
 * <p>Logs a warning, if actualStart is set to a time after actualStop.
 *
 * @param actualStart A Date object representing the time when this job was started.
 */
public void setActualStart(Date actualStart) {
    ArgumentNotValid.checkNotNull(actualStart, "actualStart");
    if (actualStop != null && actualStop.before(actualStart)) {
        log.warn("Job(" + getJobID() + "): Start time (" + actualStart + ") is after end time: "
                + actualStop);
    }
    this.actualStart = (Date) actualStart.clone();
}
/**
 * Set the actual time when this job was stopped/completed. Logs a warning, if actualStop is set
 * to a time before actualStart.
 *
 * @param actualStop A Date object representing the time when this job was stopped.
 * @throws ArgumentNotValid If actualStop is null.
 */
public void setActualStop(Date actualStop) throws ArgumentNotValid {
    ArgumentNotValid.checkNotNull(actualStop, "actualStop");
    if (actualStart == null) {
        log.warn("Job(" + getJobID() + "): actualStart should be defined before setting actualStop");
    } else if (actualStop.before(actualStart)) {
        log.warn("Job(" + getJobID() + "): actualStop (" + actualStop + ") is before actualStart: "
                + actualStart);
    }
    this.actualStop = (Date) actualStop.clone();
}
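// Illustrative sketch (not part of the original source): actualStart and actualStop are stored
// as clones because java.util.Date is mutable; without the clone, a caller could silently change
// the job's recorded times afterwards. The method name is hypothetical.
private static Date defensiveCopyExample(Date suppliedByCaller) {
    Date stored = (Date) suppliedByCaller.clone(); // what setActualStart()/setActualStop() do
    suppliedByCaller.setTime(0L);                  // the caller mutates its own instance...
    return stored;                                 // ...but the stored copy is unaffected
}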
/**
 * Sets status of this job.
 *
 * @param newStatus The new status; must be one of the JobStatus values (NEW, ..., FAILED)
 * @throws ArgumentNotValid in case of invalid status argument or invalid status change
 */
public void setStatus(JobStatus newStatus) {
    ArgumentNotValid.checkNotNull(newStatus, "newStatus");
    if (!status.legalChange(newStatus)) {
        final String message = "Status change from " + status + " to " + newStatus + " is not allowed";
        log.debug(message);
        throw new ArgumentNotValid(message);
    }
    if ((this.status == JobStatus.NEW || this.status == JobStatus.RESUBMITTED)
            && newStatus == JobStatus.SUBMITTED) {
        orderXMLdoc.configureQuotaEnforcer(
                maxObjectsIsSetByQuotaEnforcer, forceMaxBytesPerDomain, forceMaxObjectsPerDomain);
    }
    if (this.status == JobStatus.SUBMITTED && newStatus == JobStatus.STARTED) {
        setActualStart(new Date());
    }
    if (this.status == JobStatus.STARTED
            && (newStatus == JobStatus.DONE || newStatus == JobStatus.FAILED)) {
        setActualStop(new Date());
    }
    status = newStatus;
}
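// Illustrative sketch (not part of the original source): the kind of transition check that
// status.legalChange(newStatus) performs above. This enum and its transition table are
// hypothetical stand-ins, shown only to make the "invalid status change" rule concrete.
enum SketchJobStatus {
    NEW, SUBMITTED, STARTED, DONE, FAILED, RESUBMITTED;

    /** Returns true if moving from this status to 'next' is allowed in this sketch. */
    boolean legalChange(SketchJobStatus next) {
        switch (this) {
            case NEW:
            case RESUBMITTED:
                return next == SUBMITTED;
            case SUBMITTED:
                return next == STARTED;
            case STARTED:
                return next == DONE || next == FAILED;
            case FAILED:
                return next == RESUBMITTED; // assumption: failed jobs may be resubmitted
            default:
                return false; // DONE is terminal in this sketch
        }
    }
}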
/**
 * This method should be overridden and implemented by a subclass if message handling is wanted.
 *
 * @param msg a GetAllFilenamesMessage
 * @throws PermissionDenied when invoked
 */
public void visit(GetAllFilenamesMessage msg) throws PermissionDenied {
    ArgumentNotValid.checkNotNull(msg, "msg");
    deny(msg);
}
/**
 * Set the orderxml for this job.
 *
 * @param doc An orderxml to be used by this job
 */
public void setOrderXMLDoc(HeritrixTemplate doc) {
    ArgumentNotValid.checkNotNull(doc, "doc");
    this.orderXMLdoc = doc;
}
/**
 * Package private constructor for common initialisation.
 *
 * @param harvestID the id of the harvestdefinition
 * @param cfg the configuration to base the Job on
 * @param orderXMLdoc the Heritrix template (order XML) to be used by this job
 * @param channel the channel on which the job will be submitted.
 * @param forceMaxObjectsPerDomain the maximum number of objects harvested from a domain,
 * overrides individual configuration settings. -1 means no limit
 * @param forceMaxBytesPerDomain The maximum number of bytes harvested from a domain, or -1 for
 * no limit.
 * @param forceMaxJobRunningTime The max time in seconds given to the harvester for this job
 * @param harvestNum the run number of the harvest definition
 * @throws ArgumentNotValid if cfg or channel is null or harvestID is invalid, or if any limit <
 * -1
 */
public Job(
        Long harvestID,
        DomainConfiguration cfg,
        HeritrixTemplate orderXMLdoc,
        HarvestChannel channel,
        long forceMaxObjectsPerDomain,
        long forceMaxBytesPerDomain,
        long forceMaxJobRunningTime,
        int harvestNum)
        throws ArgumentNotValid {
    ArgumentNotValid.checkNotNull(cfg, "cfg");
    ArgumentNotValid.checkNotNull(harvestID, "harvestID");
    ArgumentNotValid.checkNotNegative(harvestID, "harvestID");
    ArgumentNotValid.checkNotNull(channel, "channel");
    if (forceMaxObjectsPerDomain < -1) {
        String msg = "forceMaxObjectsPerDomain must be either -1 or positive";
        log.debug(msg);
        throw new ArgumentNotValid(msg);
    }
    if (forceMaxBytesPerDomain < -1) {
        String msg = "forceMaxBytesPerDomain must be either -1 or positive";
        log.debug(msg);
        throw new ArgumentNotValid(msg);
    }
    if (forceMaxBytesPerDomain == 0L) {
        log.warn("forceMaxBytesPerDomain should probably not be 0. Means 0 bytes downloaded per domain");
    }
    if (forceMaxObjectsPerDomain == 0L) {
        log.warn("forceMaxObjectsPerDomain should probably not be 0. Means 0 objects downloaded per domain");
    }
    // Set up initial members.
    domainConfigurationMap = new HashMap<>();
    origHarvestDefinitionID = harvestID;
    orderXMLname = cfg.getOrderXmlName();
    this.orderXMLdoc = orderXMLdoc;
    setHarvestChannel(channel);
    long maxObjects = NumberUtils.minInf(forceMaxObjectsPerDomain, cfg.getMaxObjects());
    setMaxObjectsPerDomain(maxObjects);
    configurationSetsObjectLimit = (maxObjects != forceMaxObjectsPerDomain);
    long maxBytes = NumberUtils.minInf(forceMaxBytesPerDomain, cfg.getMaxBytes());
    setMaxBytesPerDomain(maxBytes);
    configurationSetsByteLimit = (maxBytes != forceMaxBytesPerDomain);
    long expectation =
            cfg.getExpectedNumberOfObjects(forceMaxObjectsPerDomain, forceMaxBytesPerDomain);
    maxCountObjects = expectation;
    minCountObjects = expectation;
    this.harvestNum = harvestNum;
    addConfiguration(cfg);
    setMaxJobRunningTime(forceMaxJobRunningTime);
    setArchiveFormatInTemplate(Settings.get(HarvesterSettings.HERITRIX_ARCHIVE_FORMAT));
    setAttributes(cfg.getAttributesAndTypes());
    orderXMLdoc.enableOrDisableDeduplication(
            Settings.getBoolean(HarvesterSettings.DEDUPLICATION_ENABLED));
    status = JobStatus.NEW;
}
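// Illustrative sketch (not part of the original source): the constructor above merges the forced
// per-domain limit with the configuration's own limit via NumberUtils.minInf, where -1 appears
// to mean "no limit". A hypothetical, JDK-only rendering of that idea:
/** Returns the smaller of two limits, treating -1 as "unlimited". */
private static long minWhereMinusOneMeansUnlimited(long a, long b) {
    if (a == -1L) {
        return b;
    }
    if (b == -1L) {
        return a;
    }
    return Math.min(a, b);
}
// Example: minWhereMinusOneMeansUnlimited(-1L, 10_000L) == 10_000L,
// and minWhereMinusOneMeansUnlimited(5_000L, 10_000L) == 5_000L.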
/**
 * Adds a configuration to this Job. Seedlists and settings are updated accordingly.
 *
 * @param cfg the configuration to add
 * @throws ArgumentNotValid if cfg is null, or cfg uses a different orderxml than this job, or
 * this job already contains a configuration associated with the domain of configuration cfg.
 */
public void addConfiguration(DomainConfiguration cfg) {
    ArgumentNotValid.checkNotNull(cfg, "cfg");
    if (domainConfigurationMap.containsKey(cfg.getDomainName())) {
        throw new ArgumentNotValid(
                "Job already has a configuration for Domain " + cfg.getDomainName());
    }
    if (log.isTraceEnabled()) {
        log.trace("Adding configuration '{}' to job '{}'", cfg, cfg.getName());
    }
    if (!underConstruction) {
        final String msg = "Cannot modify job " + this + " as it is no longer under construction";
        log.debug(msg);
        throw new IllegalState(msg);
    }
    if (!cfg.getOrderXmlName().equals(getOrderXMLName())) {
        throw new ArgumentNotValid(
                "Job requires the orderxml file:'" + getOrderXMLName() + "' not:'"
                        + cfg.getOrderXmlName() + "' used by the configuration:'"
                        + cfg.getName() + "'");
    }
    domainConfigurationMap.put(cfg.getDomainName(), cfg.getName());

    // Add the seeds from the configuration to the Job seeds.
    // Take care of duplicates.
    for (Iterator<SeedList> itt = cfg.getSeedLists(); itt.hasNext(); ) {
        SeedList seed = itt.next();
        List<String> seeds = seed.getSeeds();
        for (String seedUrl : seeds) {
            seedListSet.add(seedUrl); // duplicates are silently ignored

            // TODO remove when heritrix implements this functionality
            // Try to convert a seed into an Internationalized Domain Name.
            try {
                String seedASCII = seedUrl;
                // It is rare to see these seeds, but they need to be correctly idnaized.
                if (seedUrl.contains(":") || seedUrl.contains("/")) {
                    String normalizedUrl = seedUrl;
                    if (!normalizedUrl.matches("^[a-zA-Z]+:.*")) {
                        // If no protocol is given, assume http.
                        normalizedUrl = "http://" + normalizedUrl;
                    }
                    URL url = new URL(normalizedUrl);
                    String domainName = url.getHost();
                    String domainNameASCII = IDNA.toASCII(domainName);
                    if (!domainName.equals(domainNameASCII)) {
                        // If the domain name changed, replace that in the seed.
                        seedASCII = seedUrl.replaceFirst(Pattern.quote(domainName), domainNameASCII);
                    }
                } else {
                    seedASCII = IDNA.toASCII(seedUrl);
                }
                if (!seedASCII.equals(seedUrl)) {
                    log.trace("Converted {} to {}", seedUrl, seedASCII);
                    // Note that duplicates are silently ignored.
                    seedListSet.add(seedASCII);
                }
            } catch (IDNAException e) {
                log.trace("Cannot convert seed {} to ASCII", seedUrl, e);
            } catch (MalformedURLException e) {
                log.trace("Cannot convert seed {} to ASCII", seedUrl, e);
            }
        }
    }

    orderXMLdoc.editOrderXMLAddPerDomainCrawlerTraps(cfg);

    // TODO update limits in settings files - see also bug 269

    // Update estimates of job size.
    long expectation =
            cfg.getExpectedNumberOfObjects(forceMaxObjectsPerDomain, forceMaxBytesPerDomain);
    maxCountObjects = Math.max(expectation, maxCountObjects);
    minCountObjects = Math.min(expectation, minCountObjects);
    totalCountObjects += expectation;

    configsChanged = true;

    assert (maxCountObjects >= minCountObjects) : "basic invariant";
}
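// Illustrative sketch (not part of the original source): the IDNA handling above relies on the
// project's IDNA class; the JDK provides an equivalent converter in java.net.IDN. This
// hypothetical helper shows the same idea of rewriting only the host part of a seed URL to its
// ASCII (punycode) form, assuming "http://" when no protocol is given.
private static String toAsciiSeed(String seedUrl) throws MalformedURLException {
    String normalizedUrl = seedUrl;
    if (!normalizedUrl.matches("^[a-zA-Z]+:.*")) {
        normalizedUrl = "http://" + normalizedUrl;
    }
    String host = new URL(normalizedUrl).getHost();
    String asciiHost = java.net.IDN.toASCII(host);
    return seedUrl.replaceFirst(Pattern.quote(host), asciiHost);
}
// Example: toAsciiSeed("http://münchen.de/rathaus") yields "http://xn--mnchen-3ya.de/rathaus".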
/**
 * Initialize the job before running. This is called before the processFile() calls.
 *
 * @param os the OutputStream to which output should be written
 */
public void initialize(OutputStream os) {
    ArgumentNotValid.checkNotNull(os, "OutputStream os");
    loadBatchJob();
    loadedJob.initialize(os);
}
/**
 * Process one file stored in the bit archive.
 *
 * @param file the file to be processed.
 * @param os the OutputStream to which output should be written
 * @return true if the file was successfully processed, false otherwise
 */
public boolean processFile(File file, OutputStream os) {
    log.trace("Started processing of file '" + file.getAbsolutePath() + "'.");
    ArgumentNotValid.checkNotNull(file, "File file");
    ArgumentNotValid.checkNotNull(os, "OutputStream os");
    return loadedJob.processFile(file, os);
}
/**
 * Finish up the job. This is called after the last process() call.
 *
 * @param os the OutputStream to which output should be written
 */
public void finish(OutputStream os) {
    ArgumentNotValid.checkNotNull(os, "OutputStream os");
    loadedJob.finish(os);
}
/**
 * This method should be overridden and implemented by a subclass if message handling is wanted.
 *
 * @param msg a CorrectMessage for correcting a record.
 * @throws PermissionDenied when invoked
 */
public void visit(CorrectMessage msg) throws PermissionDenied {
    ArgumentNotValid.checkNotNull(msg, "msg");
    deny(msg);
}
/**
 * This method should be overridden and implemented by a subclass if message handling is wanted.
 *
 * @param msg a RemoveAndGetFileMessage
 * @throws PermissionDenied when invoked
 */
public void visit(RemoveAndGetFileMessage msg) throws PermissionDenied {
    ArgumentNotValid.checkNotNull(msg, "msg");
    deny(msg);
}