/**
 * Applies the chosen archive format (arc/warc) to the order template of this job.
 *
 * @param archiveFormat the archive format handed on to the order template
 * @throws IllegalState if this job is no longer under construction
 */
private void setArchiveFormatInTemplate(String archiveFormat) {
    if (underConstruction) {
        orderXMLdoc.setArchiveFormat(archiveFormat);
        return;
    }
    // The job is frozen: log the reason and refuse the modification.
    final String refusal = "Cannot modify job " + this + " as it is no longer under construction";
    log.debug(refusal);
    throw new IllegalState(refusal);
}
/**
 * Stores the maximum running time for this job and propagates it to the order template.
 *
 * @param maxJobRunningTime the maximum running time in seconds, or 0 for no limit
 * @throws IllegalState if this job is no longer under construction
 */
protected void setMaxJobRunningTime(long maxJobRunningTime) {
    if (underConstruction) {
        // Keep the job field and the template in step.
        this.forceMaxRunningTime = maxJobRunningTime;
        orderXMLdoc.setMaxJobRunningTime(maxJobRunningTime);
    } else {
        final String refusal = "Cannot modify job " + this + " as it is no longer under construction";
        log.debug(refusal);
        throw new IllegalState(refusal);
    }
}
/**
 * Stores the per-domain byte limit for this job and propagates it to the order template.
 *
 * <p>When the byte limit is set to 0 while the object limit is non-zero, the object limit is set
 * to 0 as well.
 *
 * @param maxBytesPerDomain the byte limit per domain, or -1 for no limit
 * @throws IllegalState if this job is no longer under construction
 */
protected void setMaxBytesPerDomain(long maxBytesPerDomain) {
    if (!underConstruction) {
        final String refusal = "Cannot modify job " + this + " as it is no longer under construction";
        log.debug(refusal);
        throw new IllegalState(refusal);
    }

    this.forceMaxBytesPerDomain = maxBytesPerDomain;
    orderXMLdoc.setMaxBytesPerDomain(maxBytesPerDomain);

    // A zero byte limit drags the object limit down to zero too; the check on the
    // object limit stops the two setters from calling each other forever.
    final boolean bytesAreZero = (maxBytesPerDomain == 0L);
    if (bytesAreZero && forceMaxObjectsPerDomain != 0L) {
        setMaxObjectsPerDomain(0L);
    }
}
/** * Sets the maxObjectsPerDomain value. * * @param maxObjectsPerDomain The forceMaxObjectsPerDomain to set. 0 means no limit. * @throws IOFailure Thrown from auxiliary method editOrderXML_maxObjectsPerDomain. */ protected void setMaxObjectsPerDomain(long maxObjectsPerDomain) { if (!underConstruction) { final String msg = "Cannot modify job " + this + " as it is no longer under construction"; log.debug(msg); throw new IllegalState(msg); } this.forceMaxObjectsPerDomain = maxObjectsPerDomain; orderXMLdoc.setMaxObjectsPerDomain( maxObjectsPerDomain); // FIXME? add argument to maxObjectsIsSetByQuotaEnforcer to method // setMaxObjectsPerDomain // orderXMLdoc.editOrderXML_maxObjectsPerDomain(orderXMLdoc, maxObjectsPerDomain, // maxObjectsIsSetByQuotaEnforcer); if (0L == maxObjectsPerDomain && 0L != forceMaxBytesPerDomain) { setMaxBytesPerDomain(0L); } }
/**
 * Moves this job to a new status, performing the bookkeeping tied to specific transitions.
 *
 * <ul>
 *   <li>NEW or RESUBMITTED to SUBMITTED: the quota enforcer of the order template is configured.
 *   <li>SUBMITTED to STARTED: the actual start time is set to the current time.
 *   <li>STARTED to DONE or FAILED: the actual stop time is set to the current time.
 * </ul>
 *
 * @param newStatus the status to change to; must not be null
 * @throws ArgumentNotValid if newStatus is null or the change from the current status to
 *     newStatus is not legal
 */
public void setStatus(JobStatus newStatus) {
    ArgumentNotValid.checkNotNull(newStatus, "newStatus");

    final boolean transitionAllowed = status.legalChange(newStatus);
    if (!transitionAllowed) {
        final String complaint = "Status change from " + status + " to " + newStatus + " is not allowed";
        log.debug(complaint);
        throw new ArgumentNotValid(complaint);
    }

    // Submission of a fresh or resubmitted job: finalise the quota settings in the template.
    if (newStatus == JobStatus.SUBMITTED) {
        if (this.status == JobStatus.NEW || this.status == JobStatus.RESUBMITTED) {
            orderXMLdoc.configureQuotaEnforcer(
                maxObjectsIsSetByQuotaEnforcer, forceMaxBytesPerDomain, forceMaxObjectsPerDomain);
        }
    }

    // The job starts running: record when.
    if (newStatus == JobStatus.STARTED && this.status == JobStatus.SUBMITTED) {
        setActualStart(new Date());
    }

    // The running job ends, successfully or not: record when.
    final boolean reachesEndState = (newStatus == JobStatus.DONE || newStatus == JobStatus.FAILED);
    if (reachesEndState && this.status == JobStatus.STARTED) {
        setActualStop(new Date());
    }

    this.status = newStatus;
}
/** * Adds a configuration to this Job. Seedlists and settings are updated accordingly. * * @param cfg the configuration to add * @throws ArgumentNotValid if cfg is null or cfg uses a different orderxml than this job or if * this job already contains a configuration associated with domain of configuration cfg. */ public void addConfiguration(DomainConfiguration cfg) { ArgumentNotValid.checkNotNull(cfg, "cfg"); if (domainConfigurationMap.containsKey(cfg.getDomainName())) { throw new ArgumentNotValid( "Job already has a configuration for Domain " + cfg.getDomainName()); } if (log.isTraceEnabled()) { log.trace("Adding configuration '{}' to job '{}'", cfg, cfg.getName()); } if (!underConstruction) { final String msg = "Cannot modify job " + this + " as it is no longer under construction"; log.debug(msg); throw new IllegalState(msg); } if (!cfg.getOrderXmlName().equals(getOrderXMLName())) { throw new ArgumentNotValid( "Job requires the orderxml file:'" + getOrderXMLName() + "' not:'" + cfg.getOrderXmlName() + "' used by the configuration:'" + cfg.getName()); } domainConfigurationMap.put(cfg.getDomainName(), cfg.getName()); // Add the seeds from the configuration to the Job seeds. // Take care of duplicates. 
for (Iterator<SeedList> itt = cfg.getSeedLists(); itt.hasNext(); ) { SeedList seed = itt.next(); List<String> seeds = seed.getSeeds(); for (String seedUrl : seeds) { seedListSet.add(seedUrl); // duplicates is silently ignored // TODO remove when heritrix implements this functionality // try to convert a seed into a Internationalized Domain Name try { String seedASCII = seedUrl; // It is rare to see these seeds, but they need to be // correctly idnaized if (seedUrl.contains(":") || seedUrl.contains("/")) { String normalizedUrl = seedUrl; if (!normalizedUrl.matches("^[a-zA-Z]+:.*")) { // If no protocol is given, assume http normalizedUrl = "http://" + normalizedUrl; } URL url = new URL(normalizedUrl); String domainName = url.getHost(); String domainNameASCII = IDNA.toASCII(domainName); if (!domainName.equals(domainNameASCII)) { // If the domain name changed, replace that in the // seed. seedASCII = seedUrl.replaceFirst(Pattern.quote(domainName), domainNameASCII); } } else { seedASCII = IDNA.toASCII(seedUrl); } if (!seedASCII.equals(seedUrl)) { log.trace("Converted {} to {}", seedUrl, seedASCII); // Note that duplicates is silently ignored seedListSet.add(seedASCII); } } catch (IDNAException e) { log.trace("Cannot convert seed {} to ASCII", seedUrl, e); } catch (MalformedURLException e) { log.trace("Cannot convert seed {} to ASCII", seedUrl, e); } } } orderXMLdoc.editOrderXMLAddPerDomainCrawlerTraps(cfg); // TODO update limits in settings files - see also bug 269 // Update estimates of job size long expectation = cfg.getExpectedNumberOfObjects(forceMaxObjectsPerDomain, forceMaxBytesPerDomain); maxCountObjects = Math.max(expectation, maxCountObjects); minCountObjects = Math.min(expectation, minCountObjects); totalCountObjects += expectation; configsChanged = true; assert (maxCountObjects >= minCountObjects) : "basic invariant"; }
/**
 * Hands the given attributes to the order template of this job by calling
 * {@code insertAttributes} on it.
 *
 * <p>NOTE(review): unlike the other setters shown alongside it, this method does not check
 * whether the job is still under construction - confirm that this is intended.
 *
 * @param attributesAndTypes the attributes (with their types) to insert into the order template
 */
public void setAttributes(List<AttributeAndType> attributesAndTypes) {
    this.orderXMLdoc.insertAttributes(attributesAndTypes);
}
/** * Package private constructor for common initialisation. * * @param harvestID the id of the harvestdefinition * @param cfg the configuration to base the Job on * @param orderXMLdoc * @param channel the channel on which the job will be submitted. * @param forceMaxObjectsPerDomain the maximum number of objects harvested from a domain, * overrides individual configuration settings. -1 means no limit * @param forceMaxBytesPerDomain The maximum number of objects harvested from a domain, or -1 for * no limit. * @param forceMaxJobRunningTime The max time in seconds given to the harvester for this job * @param harvestNum the run number of the harvest definition * @throws ArgumentNotValid if cfg or priority is null or harvestID is invalid, or if any limit < * -1 */ public Job( Long harvestID, DomainConfiguration cfg, HeritrixTemplate orderXMLdoc, HarvestChannel channel, long forceMaxObjectsPerDomain, long forceMaxBytesPerDomain, long forceMaxJobRunningTime, int harvestNum) throws ArgumentNotValid { ArgumentNotValid.checkNotNull(cfg, "cfg"); ArgumentNotValid.checkNotNull(harvestID, "harvestID"); ArgumentNotValid.checkNotNegative(harvestID, "harvestID"); ArgumentNotValid.checkNotNull(channel, "channel"); if (forceMaxObjectsPerDomain < -1) { String msg = "forceMaxObjectsPerDomain must be either -1 or positive"; log.debug(msg); throw new ArgumentNotValid(msg); } if (forceMaxBytesPerDomain < -1) { String msg = "forceMaxBytesPerDomain must be either -1 or positive"; log.debug(msg); throw new ArgumentNotValid(msg); } if (forceMaxBytesPerDomain == 0L) { log.warn( "forceMaxBytesPerDomain should probably not be 0.Means 0 bytes downloaded per domain"); } if (forceMaxObjectsPerDomain == 0L) { log.warn( "forceMaxObjectsPerDomain should probably not be 0.Means 0 objects downloaded per domain"); } // setup initial members domainConfigurationMap = new HashMap<>(); origHarvestDefinitionID = harvestID; orderXMLname = cfg.getOrderXmlName(); this.orderXMLdoc = orderXMLdoc; 
setHarvestChannel(channel); long maxObjects = NumberUtils.minInf(forceMaxObjectsPerDomain, cfg.getMaxObjects()); setMaxObjectsPerDomain(maxObjects); configurationSetsObjectLimit = (maxObjects != forceMaxObjectsPerDomain); long maxBytes = NumberUtils.minInf(forceMaxBytesPerDomain, cfg.getMaxBytes()); setMaxBytesPerDomain(maxBytes); configurationSetsByteLimit = (maxBytes != forceMaxBytesPerDomain); long expectation = cfg.getExpectedNumberOfObjects(forceMaxObjectsPerDomain, forceMaxBytesPerDomain); maxCountObjects = expectation; minCountObjects = expectation; this.harvestNum = harvestNum; addConfiguration(cfg); setMaxJobRunningTime(forceMaxJobRunningTime); setArchiveFormatInTemplate(Settings.get(HarvesterSettings.HERITRIX_ARCHIVE_FORMAT)); setAttributes(cfg.getAttributesAndTypes()); orderXMLdoc.enableOrDisableDeduplication( Settings.getBoolean(HarvesterSettings.DEDUPLICATION_ENABLED)); status = JobStatus.NEW; }