Example #1
0
 /**
  * Performs one simulated hash step.
  *
  * <p>Throws {@link ExpectedRuntimeException} when {@code nSteps} equals {@code whenToThrow}.
  * Otherwise, if steps remain, {@code nSteps} is decremented and the step is simulated in one of
  * two ways: when {@code eachStepTime} is positive, the method sleeps in 1 ms increments until a
  * deadline of {@code eachStepTime} expires, counting one unit of work per sleep; otherwise it
  * passes {@code -eachStepTime} to {@code TimeBase.step()} and sleeps once for 1 ms.
  *
  * @param metric not used by this implementation
  * @return the amount of work recorded for this step (0 if no steps remained)
  * @throws RuntimeException if the thread is interrupted while sleeping; the original
  *     {@link InterruptedException} is attached as the cause
  */
 public int computeStep(int metric) {
   int work = 0;
   if (nSteps == whenToThrow) {
     throw new ExpectedRuntimeException("Hash step throw test");
   }
   if (nSteps-- > 0) {
     if (eachStepTime > 0) {
       Deadline time = Deadline.in(eachStepTime);
       while (!time.expired()) {
         try {
           Thread.sleep(1);
         } catch (InterruptedException e) {
           // Keep the same message as before, but preserve the cause for diagnosis
           throw new RuntimeException(e.toString(), e);
         }
         work++;
       }
     } else {
       work = -eachStepTime;
       TimeBase.step(work);
       try {
         Thread.sleep(1);
       } catch (InterruptedException e) {
         // Keep the same message as before, but preserve the cause for diagnosis
         throw new RuntimeException(e.toString(), e);
       }
     }
   }
   return work;
 }
 /**
  * Waits until {@code fetchDeadline} has expired (if it has not already), then re-arms it to
  * expire {@code m_crawlDelay} from now.
  *
  * <p>If the wait is interrupted, the thread's interrupt status is restored so that the caller can
  * observe the interruption; the deadline is still re-armed.
  */
 private void pauseBeforeFetch() {
   if (!fetchDeadline.expired()) {
     try {
       fetchDeadline.sleep();
     } catch (InterruptedException ie) {
       // A thrown InterruptedException conventionally clears the interrupt flag; set it again
       // rather than silently discarding the interruption.
       Thread.currentThread().interrupt();
     }
   }
   fetchDeadline.expireIn(m_crawlDelay);
 }
  /**
   * Stores a {@link NodeState} with a crawl state and two poll states through the repository,
   * checks that the backing file exists, then reloads the state and verifies every stored value.
   *
   * <p>Runs under simulated time; real time is restored in a {@code finally} block so that a
   * failing assertion cannot leave simulated time enabled for subsequent tests.
   */
  public void testStoreNodeState() throws Exception {
    TimeBase.setSimulated(100);
    try {
      CachedUrlSet mcus =
          new MockCachedUrlSet(mau, new RangeCachedUrlSetSpec("http://www.example.com"));
      CrawlState crawl = new CrawlState(1, 2, 123);
      List polls = new ArrayList(2);
      PollState poll1 = new PollState(1, "sdf", "jkl", 2, 123, Deadline.at(456), false);
      PollState poll2 = new PollState(2, "abc", "def", 3, 321, Deadline.at(654), false);
      polls.add(poll1);
      polls.add(poll2);
      NodeState nodeState = new NodeStateImpl(mcus, 123321, crawl, polls, repository);
      ((NodeStateImpl) nodeState).setState(NodeState.DAMAGE_AT_OR_BELOW);
      repository.storeNodeState(nodeState);

      // The state should have been written to the node file under the AU's directory
      String filePath = LockssRepositoryImpl.mapAuToFileLocation(tempDirPath, mau);
      filePath =
          LockssRepositoryImpl.mapUrlToFileLocation(
              filePath, "http://www.example.com/" + HistoryRepositoryImpl.NODE_FILE_NAME);
      File xmlFile = new File(filePath);
      assertTrue(xmlFile.exists());

      // Reload and compare against what was stored
      nodeState = repository.loadNodeState(mcus);
      assertSame(mcus, nodeState.getCachedUrlSet());

      assertEquals(123321, nodeState.getAverageHashDuration());
      assertEquals(1, nodeState.getCrawlState().getType());
      assertEquals(2, nodeState.getCrawlState().getStatus());
      assertEquals(123, nodeState.getCrawlState().getStartTime());
      assertEquals(NodeState.DAMAGE_AT_OR_BELOW, nodeState.getState());

      Iterator pollIt = nodeState.getActivePolls();
      assertTrue(pollIt.hasNext());
      PollState loadedPoll = (PollState) pollIt.next();
      assertEquals(1, loadedPoll.getType());
      assertEquals("sdf", loadedPoll.getLwrBound());
      assertEquals("jkl", loadedPoll.getUprBound());
      assertEquals(2, loadedPoll.getStatus());
      assertEquals(123, loadedPoll.getStartTime());
      assertEquals(456, loadedPoll.getDeadline().getExpirationTime());

      assertTrue(pollIt.hasNext());
      loadedPoll = (PollState) pollIt.next();
      assertEquals(2, loadedPoll.getType());
      assertEquals("abc", loadedPoll.getLwrBound());
      assertEquals("def", loadedPoll.getUprBound());
      assertEquals(3, loadedPoll.getStatus());
      assertEquals(321, loadedPoll.getStartTime());
      assertEquals(654, loadedPoll.getDeadline().getExpirationTime());
      assertFalse(pollIt.hasNext());
    } finally {
      TimeBase.setReal();
    }
  }
 /**
  * Wait until the semaphore is full or the timer expires. If the semaphore is already full, return
  * immediately. Use {@link Deadline#expire()} to make this return early. Does not change the state
  * of the semaphore.
  *
  * @param timer time to wait. If null, returns immediately.
  * @return true if the semaphore was or became full, else false (the timer expired, or no timer
  *     was supplied and the semaphore was not full).
  * @throws InterruptedException if interrupted while waiting
  */
 public synchronized boolean waitFull(Deadline timer) throws InterruptedException {
   if (timer != null) {
     // NOTE(review): the callback presumably wakes this thread when the deadline changes or is
     // expired early - confirm against Deadline.InterruptCallback.
     Deadline.InterruptCallback cb = new Deadline.InterruptCallback();
     try {
       timer.registerCallback(cb);
       // Re-check both conditions after every wakeup
       while (!state && !timer.expired()) {
         this.wait(timer.getSleepTime());
       }
     } finally {
       // Always detach the callback, whether we finished normally or were interrupted
       cb.disable();
       timer.unregisterCallback(cb);
     }
   }
   return state;
 }
Example #5
0
 /**
  * Two records are equal when they carry equal deadlines, equal tasks and the identical event
  * value. Anything that is not a {@code BERec} is never equal.
  */
 public boolean equals(Object obj) {
   if (!(obj instanceof BERec)) {
     return false;
   }
   BERec other = (BERec) obj;
   if (!when.equals(other.when)) {
     return false;
   }
   if (!task.equals(other.task)) {
     return false;
   }
   return event == other.event;
 }
Example #6
0
  /**
   * Checks that a task added as an overrunner is selected to run while the only scheduled task is
   * not yet runnable, and that it runs without an associated schedule chunk.
   */
  public void testFindOverrunTaskToRun() {
    // Nothing has been added yet, so no task is expected
    assertFalse(tr.findTaskToRun());
    StepTask t1 = task(100, 200, 100);
    Schedule s = sched(ListUtil.list(t1));
    fact.setResult(s);
    assertTrue(tr.addToSchedule(t1));
    // t1 is scheduled but not expected to be runnable yet; the runner waits on Deadline.at(100)
    assertFalse(tr.findTaskToRun());
    assertEquals(Deadline.at(100), tr.runningDeadline);

    // Once an overrunner exists it is expected to be chosen, with the deadline unchanged
    StepTask t2 = task(0, 300, 50);
    tr.addOverrunner(t2);
    assertTrue(tr.findTaskToRun());
    assertEquals(t2, tr.runningTask);
    assertEquals(Deadline.at(100), tr.runningDeadline);
    // An overrunner is expected to have no running chunk
    assertNull(tr.runningChunk);
  }
Example #7
0
 /**
  * Builds a hash request against {@code cus} using a fresh mock hasher.
  *
  * <p>The mock hasher is configured with {@code bytes} and installed as the content hasher of
  * {@code cus} before the request is created.
  *
  * @param cookie opaque value handed to the request
  * @param deadlineIn passed to {@code Deadline.in()} to form the request deadline
  * @param duration duration handed to the request
  * @param bytes byte count given to the mock hasher
  * @param callback callback handed to the request
  * @return the new request
  */
 HashQueue.Request req(
     Object cookie, long deadlineIn, int duration, int bytes, HashService.Callback callback) {
   MockCachedUrlSetHasher mockHasher = new MockCachedUrlSetHasher();
   mockHasher.setNumBytes(bytes);
   cus.setContentHasher(mockHasher);
   Deadline due = Deadline.in(deadlineIn);
   return new HashQueue.Request(cus, due, callback, cookie, mockHasher, duration);
 }
    /**
     * Main loop: while {@code goOn} is set, takes a node from {@code sizeCalcQueue}, runs
     * {@code doSizeCalc()} on it, removes it from the queue, and then sleeps for the interval
     * returned by {@code sleepTimeToAchieveLoad()}. When the queue is empty the loop waits on
     * {@code sizeCalcSem} for up to an hour.
     *
     * <p>All accesses to {@code sizeCalcQueue} in this method, including the emptiness check, are
     * made while holding the queue's monitor.
     */
    public void lockssRun() {
      setPriority(PRIORITY_PARAM_SIZE_CALC, PRIORITY_DEFAULT_SIZE_CALC);
      startWDog(WDOG_PARAM_SIZE_CALC, WDOG_DEFAULT_SIZE_CALC);
      triggerWDogOnExit(true);
      nowRunning();

      while (goOn) {
        try {
          pokeWDog();
          // Read the queue under the same lock used by the other accesses below; the semaphore
          // wait itself must happen outside the lock.
          boolean queueEmpty;
          synchronized (sizeCalcQueue) {
            queueEmpty = sizeCalcQueue.isEmpty();
          }
          if (queueEmpty) {
            Deadline timeout = Deadline.in(Constants.HOUR);
            sizeCalcSem.take(timeout);
          }
          RepositoryNode node;
          synchronized (sizeCalcQueue) {
            node = (RepositoryNode) CollectionUtil.getAnElement(sizeCalcQueue);
          }
          if (node != null) {
            long start = TimeBase.nowMs();
            log.debug2("CalcSize start: " + node);
            // Stays 0 if doSizeCalc() fails, so a failure contributes no measured duration
            long dur = 0;
            try {
              doSizeCalc(node);
              dur = TimeBase.nowMs() - start;
              log.debug2("CalcSize finish (" + StringUtil.timeIntervalToString(dur) + "): " + node);
            } catch (RuntimeException e) {
              log.warning("doSizeCalc: " + node, e);
            }
            // Remove the node whether or not the calculation succeeded
            synchronized (sizeCalcQueue) {
              sizeCalcQueue.remove(node);
            }
            pokeWDog();
            long sleep = sleepTimeToAchieveLoad(dur, sizeCalcMaxLoad);
            Deadline.in(sleep).sleep();
          }
        } catch (InterruptedException e) {
          // just wakeup and check for exit
        }
      }
      if (!goOn) {
        triggerWDogOnExit(false);
      }
    }
Example #9
0
 /**
  * Sleeps for the interval given by the {@code KEY_TIME} request parameter.
  *
  * <p>On success {@code statusMsg} reports the interval slept; if the parameter cannot be parsed
  * or the sleep is interrupted, {@code errMsg} is set instead.
  */
 private void doSleep() throws IOException {
   final String requested = getParameter(KEY_TIME);
   try {
     final long interval = StringUtil.parseTimeInterval(requested);
     Deadline wakeAt = Deadline.in(interval);
     wakeAt.sleep();
     statusMsg = "Slept for " + StringUtil.timeIntervalToString(interval);
   } catch (NumberFormatException badInterval) {
     errMsg = "Illegal duration: " + badInterval;
   } catch (InterruptedException interrupted) {
     errMsg = "Interrupted: " + interrupted;
   }
 }
Example #10
0
 /**
  * Checks {@code getAvailableHashTimeBefore()} on an empty queue, after three requests have been
  * queued, and after a fourth request that uses up the remaining time before the earliest
  * deadline.
  */
 public void testGetAvailableHashTimeBefore() {
   HashQueue queue = new HashQueue();
   // With nothing queued, the whole interval is available
   assertEquals(500, queue.getAvailableHashTimeBefore(Deadline.in(500)));

   HashQueue.Request first = simpleReq(200, 100);
   HashQueue.Request second = simpleReq(2000, 1200);
   HashQueue.Request third = simpleReq(3000, 500);
   assertTrue(queue.insert(first));
   assertTrue(queue.insert(second));
   assertTrue(queue.insert(third));
   assertEquals(100, queue.getAvailableHashTimeBefore(Deadline.in(100)));
   assertEquals(400, queue.getAvailableHashTimeBefore(Deadline.in(500)));
   assertEquals(700, queue.getAvailableHashTimeBefore(Deadline.in(1000)));
   assertEquals(700, queue.getAvailableHashTimeBefore(Deadline.in(2000)));
   assertEquals(1200, queue.getAvailableHashTimeBefore(Deadline.in(3000)));
   assertEquals(2200, queue.getAvailableHashTimeBefore(Deadline.in(4000)));

   // this will fully commit first 200 ms
   HashQueue.Request fourth = simpleReq(200, 100);
   assertTrue(queue.insert(fourth));
   assertEquals(0, queue.getAvailableHashTimeBefore(Deadline.in(100)));
   assertEquals(0, queue.getAvailableHashTimeBefore(Deadline.in(0)));
 }
Example #11
0
  /**
   * Checks that once simulated time passes the start of the first scheduled chunk, the runner
   * selects that chunk's task and records the chunk as running.
   */
  public void testFindChunkTaskToRun() {
    // Nothing has been added yet, so no task is expected
    assertFalse(tr.findTaskToRun());
    StepTask t1 = task(100, 200, 100);
    StepTask t2 = task(100, 300, 50);

    Schedule s = sched(ListUtil.list(t1, t2));
    // The same schedule is supplied for both addToSchedule() calls below
    fact.setResults(s, s);
    assertTrue(tr.addToSchedule(t1));
    assertTrue(tr.addToSchedule(t2));
    // Neither task is expected to be runnable yet; the runner waits on Deadline.at(100)
    assertFalse(tr.findTaskToRun());
    assertEquals(Deadline.at(100), tr.runningDeadline);

    // Move simulated time just past 100
    TimeBase.setSimulated(101);
    assertTrue(tr.findTaskToRun());
    assertEquals(t1, tr.runningTask);
    assertEquals(t1.getLatestFinish(), tr.runningDeadline);
    assertEquals(s.getEvents().get(0), tr.runningChunk);
  }
Example #12
0
 /**
  * Checks that a task whose earliest start has passed is selected to run even though its chunk is
  * placed later in the schedule than the chunk of a task that cannot start yet.
  */
 public void testFindRunnableChunk() {
   // Nothing has been added yet, so no task is expected
   assertFalse(tr.findTaskToRun());
   StepTask t1 = task(100, 200, 100);
   StepTask t2 = task(10, 300, 50);
   // c1 (for t1) covers 100-200 and c2 (for t2) covers 200-300
   Schedule.Chunk c1 = new Schedule.Chunk(t1, Deadline.at(100), Deadline.at(200), 100);
   Schedule.Chunk c2 = new Schedule.Chunk(t2, Deadline.at(200), Deadline.at(300), 100);
   Schedule s = new Schedule(ListUtil.list(c1, c2));
   // The same schedule is supplied for both addToSchedule() calls below
   fact.setResults(s, s);
   assertTrue(tr.addToSchedule(t1));
   assertTrue(tr.addToSchedule(t2));
   assertFalse(tr.findTaskToRun());
   assertEquals(Deadline.at(100), tr.runningDeadline);
   // At time 11, t2 (created with first argument 10) is expected to be chosen along with c2
   TimeBase.setSimulated(11);
   assertTrue(tr.findTaskToRun());
   assertEquals(t2, tr.runningTask);
   assertEquals(c2, tr.runningChunk);
   assertEquals(Deadline.at(100), tr.runningDeadline);
   assertEquals(s.getEvents().get(1), tr.runningChunk);
 }
Example #13
0
  /**
   * Walks simulated time through the start and finish events of three background tasks and checks,
   * at each point, the number of callback events recorded, the background load factor, and the
   * number of accepted tasks. Finally compares the full list of recorded events, including the
   * time each was delivered, with the expected sequence.
   */
  public void testBackground() {
    // Every callback is recorded together with the (simulated) time at which it fired
    final List rec = new ArrayList();
    TaskCallback cb =
        new TaskCallback() {
          public void taskEvent(SchedulableTask task, Schedule.EventType event) {
            rec.add(new BERec(Deadline.in(0), (BackgroundTask) task, event));
          }
        };
    assertFalse(tr.findTaskToRun());
    BackgroundTask t1 = btask(100, 200, .1, cb);
    BackgroundTask t2 = btask(100, 300, .2, cb);
    BackgroundTask t3 = btask(150, 200, .4, cb);

    Schedule s =
        sched(
            ListUtil.list(
                bEvent(t1, Schedule.EventType.START),
                bEvent(t2, Schedule.EventType.START),
                bEvent(t3, Schedule.EventType.START),
                bEvent(t1, Schedule.EventType.FINISH),
                bEvent(t3, Schedule.EventType.FINISH),
                bEvent(t2, Schedule.EventType.FINISH)));
    // The same schedule is supplied for each of the three addToSchedule() calls
    fact.setResults(ListUtil.list(s, s, s));

    assertTrue(tr.addToSchedule(t1));
    assertTrue(tr.addToSchedule(t2));
    assertTrue(tr.addToSchedule(t3));
    assertEquals(3, tr.getAcceptedTasks().size());
    assertIsomorphic(ListUtil.list(t1, t2, t3), tr.getAcceptedTasks());
    // Before any start time: no events, no load
    assertFalse(tr.findTaskToRun());
    assertEquals(0, rec.size());
    assertEquals(0, tr.getBackgroundLoadFactor(), .005);
    assertEquals(Deadline.at(100), tr.runningDeadline);

    // t1 and t2 start: load .1 + .2
    TimeBase.setSimulated(101);
    assertFalse(tr.findTaskToRun());
    assertEquals(2, rec.size());
    assertEquals(.3, tr.getBackgroundLoadFactor(), .005);
    // t3 starts: load .1 + .2 + .4
    TimeBase.setSimulated(151);
    assertFalse(tr.findTaskToRun());
    assertEquals(3, rec.size());
    assertEquals(.7, tr.getBackgroundLoadFactor(), .005);
    assertEquals(3, tr.getAcceptedTasks().size());
    // t1 and t3 finish: only t2's load (.2) remains
    TimeBase.setSimulated(201);
    assertFalse(tr.findTaskToRun());
    assertEquals(5, rec.size());
    assertEquals(.2, tr.getBackgroundLoadFactor(), .005);
    assertEquals(1, tr.getAcceptedTasks().size());
    // t2 is finished explicitly, before its end time of 300
    t2.taskIsFinished();
    TimeBase.setSimulated(202);
    assertFalse(tr.findTaskToRun());
    assertEquals(6, rec.size());
    assertEquals(0, tr.getBackgroundLoadFactor(), .005);
    assertEquals(0, tr.getAcceptedTasks().size());
    // Passing t2's original end time is not expected to produce another event
    TimeBase.setSimulated(301);
    assertFalse(tr.findTaskToRun());
    assertEquals(6, rec.size());
    assertEquals(0, tr.getBackgroundLoadFactor(), .005);
    List exp =
        ListUtil.list(
            new BERec(101, t1, Schedule.EventType.START),
            new BERec(101, t2, Schedule.EventType.START),
            new BERec(151, t3, Schedule.EventType.START),
            new BERec(201, t1, Schedule.EventType.FINISH),
            new BERec(201, t3, Schedule.EventType.FINISH),
            new BERec(201, t2, Schedule.EventType.FINISH));
    assertEquals(exp, rec);
  }
Example #14
0
 /**
  * Creates a {@link StepperTask} with no callback and no cookie.
  *
  * @param minStart passed to {@code Deadline.at()} to form the first deadline argument
  * @param deadline passed to {@code Deadline.at()} to form the second deadline argument
  * @param duration duration handed to the task
  * @param stepper stepper handed to the task
  * @return the new task
  */
 static StepperTask taskBetween(long minStart, long deadline, int duration, Stepper stepper) {
   Deadline from = Deadline.at(minStart);
   Deadline until = Deadline.at(deadline);
   return new StepperTask(from, until, duration, null, null, stepper);
 }
Example #15
0
 /**
  * Creates a hash request on {@code cus} with no callback and no cookie, hashed by a new
  * {@link GenericContentHasher}.
  *
  * @param deadlineIn passed to {@code Deadline.in()} to form the request deadline
  * @param duration duration handed to the request
  * @return the new request
  */
 HashQueue.Request simpleReq(long deadlineIn, int duration) {
   Deadline due = Deadline.in(deadlineIn);
   GenericContentHasher contentHasher = new GenericContentHasher(cus, dig);
   return new HashQueue.Request(cus, due, null, null, contentHasher, duration);
 }
Example #16
0
 /**
  * Creates a {@link BackgroundTask} whose two deadlines are {@code Deadline.at(start)} and
  * {@code Deadline.at(end)}, with the given load factor and callback.
  */
 BackgroundTask btask(long start, long end, double loadFactor, TaskCallback cb) {
   Deadline from = Deadline.at(start);
   Deadline until = Deadline.at(end);
   return new BackgroundTask(from, until, loadFactor, cb);
 }
Example #17
0
 /**
  * Creates a {@link StepperTask} whose two deadlines are {@code Deadline.at(start)} and
  * {@code Deadline.at(end)}, with the given duration, callback and stepper, and a null cookie.
  */
 StepTask task(long start, long end, long duration, TaskCallback cb, Stepper stepper) {
   Deadline from = Deadline.at(start);
   Deadline until = Deadline.at(end);
   return new StepperTask(from, until, duration, cb, null, stepper);
 }
Example #18
0
public class CrawlRuleTester extends Thread {
  protected static Logger log = Logger.getLogger(CrawlRuleTester.class);

  /** Proxy host */
  public static final String PARAM_PROXY_HOST = Configuration.PREFIX + "crawltest.proxy.host";

  /** Proxy port */
  public static final String PARAM_PROXY_PORT = Configuration.PREFIX + "crawltest.proxy.port";

  public static final int DEFAULT_PROXY_PORT = -1;

  /** User-Agent */
  public static final String PARAM_USER_AGENT = Configuration.PREFIX + "crawltest.userAgent";

  /* Message Types */
  public static final int ERROR_MESSAGE = 0;
  public static final int WARNING_MESSAGE = 1;
  public static final int PLAIN_MESSAGE = 2;
  public static final int URL_SUMMARY_MESSAGE = 3;
  public static final int TEST_SUMMARY_MESSAGE = 4;

  private String m_baseUrl;
  private int m_crawlDepth;
  private long m_crawlDelay;
  private int m_curDepth;
  private ArchivalUnit m_au;
  private String m_outputFile = null;
  private BufferedWriter m_outWriter = null;
  private Deadline fetchDeadline = Deadline.in(0);
  private boolean useLocalWriter = true;
  private MessageHandler m_msgHandler;
  private LockssUrlConnectionPool connectionPool = new LockssUrlConnectionPool();
  private String proxyHost;
  private String userAgent;
  private int proxyPort;

  // our storage for extracted urls
  private TreeSet m_extracted = new TreeSet();
  private TreeSet m_incls = new TreeSet();
  private TreeSet m_excls = new TreeSet();
  private TreeSet m_reported = new TreeSet();

  /**
   * Creates a tester thread named "crawlrule tester".
   *
   * <p>The effective crawl delay is the larger of {@code crawlDelay} and the configured minimum
   * fetch delay ({@code BaseArchivalUnit.PARAM_MIN_FETCH_DELAY}).
   *
   * @param crawlDepth number of depth levels to crawl
   * @param crawlDelay requested delay between fetches
   * @param baseUrl the url to start from
   * @param au the archival unit consulted to decide whether a url should be cached
   */
  public CrawlRuleTester(int crawlDepth, long crawlDelay, String baseUrl, ArchivalUnit au) {
    super("crawlrule tester");
    m_au = au;
    m_baseUrl = baseUrl;
    m_crawlDepth = crawlDepth;
    // Never fetch faster than the configured minimum allows
    long configuredFloor =
        CurrentConfig.getLongParam(
            BaseArchivalUnit.PARAM_MIN_FETCH_DELAY, BaseArchivalUnit.DEFAULT_MIN_FETCH_DELAY);
    m_crawlDelay = (crawlDelay >= configuredFloor) ? crawlDelay : configuredFloor;
  }
  /**
   * Creates a tester whose output goes to the named file.
   *
   * @param outFile name of the file to write results to
   * @param crawlDepth the crawl depth to use
   * @param crawlDelay the time to wait between fetches
   * @param baseUrl the url to start from
   * @param au the ArchivalUnit to use for url checking
   */
  public CrawlRuleTester(
      String outFile, int crawlDepth, long crawlDelay, String baseUrl, ArchivalUnit au) {

    this(crawlDepth, crawlDelay, baseUrl, au);
    m_outputFile = outFile;
  }

  /**
   * Creates a tester whose output goes to the supplied writer.
   *
   * @param outWriter BufferedWriter to take all output
   * @param crawlDepth the crawl depth to use
   * @param crawlDelay the time to wait between fetches
   * @param baseUrl the url to start from
   * @param au the ArchivalUnit to use for url checking
   */
  public CrawlRuleTester(
      BufferedWriter outWriter, int crawlDepth, long crawlDelay, String baseUrl, ArchivalUnit au) {
    this(crawlDepth, crawlDelay, baseUrl, au);
    m_outWriter = outWriter;
  }

  /**
   * Creates a tester whose output goes to the supplied message handler.
   *
   * @param msgHandler MessageHandler to take all output
   * @param crawlDepth the crawl depth to use
   * @param crawlDelay the time to wait between fetches
   * @param baseUrl the url to start from
   * @param au the ArchivalUnit to use for url checking
   */
  public CrawlRuleTester(
      MessageHandler msgHandler, int crawlDepth, long crawlDelay, String baseUrl, ArchivalUnit au) {
    this(crawlDepth, crawlDelay, baseUrl, au);
    m_msgHandler = msgHandler;
  }

  /**
   * Thread entry point: loads the current configuration, runs the rule check, and releases output
   * resources.
   *
   * <p>A local writer is opened only when neither a writer nor a message handler was supplied at
   * construction. That local writer is closed even if {@code checkRules()} throws, and the message
   * handler (if any) is always closed last.
   */
  public void run() {
    try {
      setConfig(ConfigManager.getCurrentConfig());
      useLocalWriter = (m_outWriter == null && m_msgHandler == null);
      if (useLocalWriter) {
        openOutputFile();
      }
      try {
        checkRules();
      } finally {
        // Close the writer we opened ourselves even when the check fails part-way through
        if (useLocalWriter) {
          closeOutputFile();
        }
      }
    } finally {
      if (m_msgHandler != null) {
        m_msgHandler.close();
      }
    }
  }

  /**
   * Reads proxy and User-Agent settings from {@code config}.
   *
   * <p>If the configuration does not supply both a proxy host and a positive port, the
   * {@code http_proxy} environment variable is tried. If no usable host/port pair results,
   * {@code proxyHost} is set to null. An empty User-Agent setting is stored as null.
   *
   * @param config the configuration to read from
   */
  void setConfig(Configuration config) {
    log.debug("config: " + config);
    proxyHost = config.get(PARAM_PROXY_HOST);
    proxyPort = config.getInt(PARAM_PROXY_PORT, DEFAULT_PROXY_PORT);

    boolean configuredProxyUnusable = StringUtil.isNullString(proxyHost) || proxyPort <= 0;
    if (configuredProxyUnusable) {
      // Fall back to the environment
      String envProxy = System.getenv("http_proxy");
      if (!StringUtil.isNullString(envProxy)) {
        try {
          HostPortParser parsed = new HostPortParser(envProxy);
          proxyHost = parsed.getHost();
          proxyPort = parsed.getPort();
        } catch (HostPortParser.InvalidSpec e) {
          log.warning("Can't parse http_proxy environment var, ignoring: " + envProxy + ": " + e);
        }
      }
    }

    // Re-evaluate: the environment may have supplied a usable pair
    if (!StringUtil.isNullString(proxyHost) && proxyPort > 0) {
      log.info("Proxying through " + proxyHost + ":" + proxyPort);
    } else {
      proxyHost = null;
    }

    userAgent = config.get(PARAM_USER_AGENT);
    if (!StringUtil.isNullString(userAgent)) {
      log.debug("Setting User-Agent to " + userAgent);
    } else {
      userAgent = null;
    }
  }

  /**
   * Points {@code m_outWriter} at the configured output file (truncating it), or at standard
   * output when no file name was given or the file cannot be opened.
   */
  private void openOutputFile() {
    BufferedWriter writer = null;
    if (m_outputFile != null) {
      try {
        writer = new BufferedWriter(new FileWriter(m_outputFile, false));
      } catch (Exception ex) {
        System.err.println("Error opening output file, writing to stdout: " + ex);
      }
    }
    if (writer == null) {
      writer = new BufferedWriter(new OutputStreamWriter(System.out));
    }
    m_outWriter = writer;
  }

  /** Closes {@code m_outWriter} if one is open, reporting (but not propagating) any I/O error. */
  private void closeOutputFile() {
    if (m_outWriter == null) {
      return;
    }
    try {
      m_outWriter.close();
    } catch (IOException ex) {
      System.err.println("Error closing output file.");
    }
  }

  int[] depth_incl;
  int[] depth_fetched;
  int[] depth_parsed;

  /**
   * Crawls breadth-first from {@code m_baseUrl} for up to {@code m_crawlDepth} levels.
   *
   * <p>At each level every pending url is fetched (after the inter-fetch pause), its extracted
   * links are classified, and the newly included urls become the pending list for the next level.
   * A summary is written at the end. If the thread is found to be interrupted, the method returns
   * immediately without writing the summary.
   */
  private void checkRules() {
    outputMessage("\nChecking " + m_baseUrl, TEST_SUMMARY_MESSAGE);
    outputMessage(
        "crawl depth: " + m_crawlDepth + "     crawl delay: " + m_crawlDelay + " ms.",
        PLAIN_MESSAGE);

    TreeSet pending = new TreeSet();
    TreeSet fetched = new TreeSet();
    // initialize with the baseUrl
    pending.add(m_baseUrl);
    depth_incl = new int[m_crawlDepth];
    depth_fetched = new int[m_crawlDepth];
    depth_parsed = new int[m_crawlDepth];
    long startedAt = TimeBase.nowMs();

    for (int level = 1; level <= m_crawlDepth; level++) {
      if (isInterrupted()) {
        return;
      }
      m_curDepth = level;
      if (pending.isEmpty()) {
        outputMessage("\nNothing left to crawl, exiting after depth " + (level - 1), PLAIN_MESSAGE);
        break;
      }
      // Snapshot this level's urls; the pending set is refilled for the next level
      String[] levelUrls = (String[]) pending.toArray(new String[0]);
      pending.clear();
      outputMessage("\nDepth " + level, PLAIN_MESSAGE);
      for (String urlstr : levelUrls) {
        if (isInterrupted()) {
          return;
        }
        pauseBeforeFetch();

        m_incls.clear();
        m_excls.clear();

        // crawl the page
        buildUrlSets(urlstr);
        fetched.add(urlstr);
        // report included/excluded urls; newly included ones are crawled at the next level
        pending.addAll(outputUrlResults(urlstr, m_incls, m_excls));
      }
    }
    long elapsed = TimeBase.nowMs() - startedAt;
    outputSummary(m_baseUrl, fetched, pending, elapsed);
  }

  /**
   * Fetches {@code url} and, if the response is HTML, extracts its links through
   * {@link MyLinkExtractorCallback}, which fills the included/excluded/extracted sets.
   *
   * <p>Non-200 responses and non-HTML content types are reported and skipped. Per-depth fetch and
   * parse counters are updated. The response stream is closed even if link extraction fails, and
   * the connection is always released. Malformed urls and I/O errors are reported through
   * {@code outputErrResults()} rather than propagated.
   *
   * @param url the url to fetch
   */
  private void buildUrlSets(String url) {

    try {
      outputMessage("\nFetching " + url, TEST_SUMMARY_MESSAGE);
      // Constructing the URL validates it; a MalformedURLException is reported below
      URL srcUrl = new URL(url);

      LockssUrlConnection conn = UrlUtil.openConnection(url, connectionPool);
      if (proxyHost != null) {
        conn.setProxy(proxyHost, proxyPort);
      }
      if (userAgent != null) {
        conn.setRequestProperty("user-agent", userAgent);
      }
      try {
        conn.execute();
        int resp = conn.getResponseCode();
        if (resp != 200) {
          outputMessage("Resp: " + resp + ": " + conn.getResponseMessage(), TEST_SUMMARY_MESSAGE);
          return;
        }
        depth_fetched[m_curDepth - 1]++;
        String cookies = conn.getResponseHeaderValue("Set-Cookie");
        if (cookies != null) {
          outputMessage("Cookies: " + cookies, PLAIN_MESSAGE);
        }
        String type = conn.getResponseContentType();
        if (type == null || !type.toLowerCase().startsWith("text/html")) {
          outputMessage("Type: " + type + ", not parsing", URL_SUMMARY_MESSAGE);
          return;
        }
        outputMessage("Type: " + type + ", extracting Urls", URL_SUMMARY_MESSAGE);
        InputStream istr = conn.getResponseInputStream();
        try {
          GoslingHtmlLinkExtractor extractor = new GoslingHtmlLinkExtractor();
          extractor.extractUrls(null, istr, null, srcUrl.toString(), new MyLinkExtractorCallback());
        } finally {
          // Close the stream whether or not extraction succeeded
          istr.close();
        }
        depth_parsed[m_curDepth - 1]++;
      } finally {
        conn.release();
      }
    } catch (MalformedURLException murle) {
      murle.printStackTrace();
      outputErrResults(url, "Malformed URL:" + murle.getMessage());
    } catch (IOException ex) {
      ex.printStackTrace();
      outputErrResults(url, "IOException: " + ex.getMessage());
    }
  }

  /**
   * Waits until {@code fetchDeadline} has expired (if it has not already), then re-arms it to
   * expire {@code m_crawlDelay} from now, so that consecutive fetches are spaced by at least the
   * crawl delay.
   *
   * <p>If the wait is interrupted, the thread's interrupt status is restored so that the
   * {@code isInterrupted()} checks elsewhere in this class can see the interruption.
   */
  private void pauseBeforeFetch() {
    if (!fetchDeadline.expired()) {
      try {
        fetchDeadline.sleep();
      } catch (InterruptedException ie) {
        // A thrown InterruptedException conventionally clears the interrupt flag; set it again
        // rather than silently discarding the interruption.
        Thread.currentThread().interrupt();
      }
    }
    fetchDeadline.expireIn(m_crawlDelay);
  }

  /**
   * Emits one message, unless this thread has been interrupted.
   *
   * <p>The message goes to the message handler when one is set (with a trailing newline appended);
   * otherwise it is written as a line to {@code m_outWriter}, falling back to standard error if
   * that write fails.
   *
   * @param msg the text to output
   * @param msgType one of the {@code *_MESSAGE} constants; only passed on to the message handler
   */
  private void outputMessage(String msg, int msgType) {
    if (isInterrupted()) {
      return;
    }

    if (m_msgHandler == null) {
      try {
        m_outWriter.write(msg);
        m_outWriter.newLine();
      } catch (Exception ex) {
        System.err.println(msg);
      }
      return;
    }
    m_msgHandler.outputMessage(msg + "\n", msgType);
  }

  /**
   * Reports an error encountered while processing a url as an {@code ERROR_MESSAGE}.
   *
   * @param url the url being processed
   * @param errMsg description of the error
   */
  private void outputErrResults(String url, String errMsg) {
    String report = "Error: " + errMsg + " occured while processing " + url;
    outputMessage(report, ERROR_MESSAGE);
  }

  /**
   * Reports the included and excluded urls found on one page.
   *
   * <p>Only urls not reported before are listed individually; they are then added to
   * {@code m_reported}. The count of new included urls is added to the counter for the current
   * depth. The writer, if any, is flushed afterwards.
   *
   * @param url the page the urls were extracted from (not used in the output)
   * @param m_inclset urls on the page that passed the crawl rules
   * @param m_exclset urls on the page that did not
   * @return the included urls that had not been reported before
   */
  private Set outputUrlResults(String url, Set m_inclset, Set m_exclset) {
    Set freshIncluded = new TreeSet(CollectionUtils.subtract(m_inclset, m_reported));
    Set freshExcluded = new TreeSet(CollectionUtils.subtract(m_exclset, m_reported));

    if (!m_inclset.isEmpty()) {
      int seenBefore = m_inclset.size() - freshIncluded.size();
      outputMessage(
          "\nIncluded Urls: (" + freshIncluded.size() + " new, " + seenBefore + " old)",
          URL_SUMMARY_MESSAGE);
      depth_incl[m_curDepth - 1] += freshIncluded.size();
    }
    for (Object included : freshIncluded) {
      outputMessage(included.toString(), PLAIN_MESSAGE);
    }

    if (!m_exclset.isEmpty()) {
      int seenBefore = m_exclset.size() - freshExcluded.size();
      outputMessage(
          "\nExcluded Urls: (" + freshExcluded.size() + " new, " + seenBefore + " old)",
          URL_SUMMARY_MESSAGE);
    }
    for (Object excluded : freshExcluded) {
      outputMessage(excluded.toString(), PLAIN_MESSAGE);
    }

    m_reported.addAll(freshIncluded);
    m_reported.addAll(freshExcluded);

    if (m_outWriter != null) {
      try {
        m_outWriter.flush();
      } catch (IOException ignored) {
        // a failed flush is not reported
      }
    }
    return freshIncluded;
  }

  /**
   * Writes the end-of-run summary: totals, a per-depth table of fetched/parsed/new-url counts,
   * the number of urls left unfetched, and the elapsed time with the fetch rate in pages per
   * minute.
   *
   * @param baseUrl the starting url
   * @param fetched the urls that were fetched
   * @param toCrawl the urls still waiting to be crawled
   * @param elapsedTime elapsed time of the run, as a difference of {@code TimeBase.nowMs()} values
   */
  private void outputSummary(String baseUrl, Set fetched, Set toCrawl, long elapsedTime) {
    final int fetchCount = fetched.size();
    outputMessage(
        "\n\nSummary for starting Url: " + baseUrl + " and depth: " + m_crawlDepth,
        TEST_SUMMARY_MESSAGE);
    outputMessage(
        "\nUrls fetched: " + fetchCount + "    Urls extracted: " + m_extracted.size(),
        PLAIN_MESSAGE);

    outputMessage("\nDepth  Fetched  Parsed  New URLs", PLAIN_MESSAGE);
    for (int ix = 0; ix < m_crawlDepth; ix++) {
      PrintfFormat rowFormat = new PrintfFormat("%5d  %7d  %6d  %8d");
      Integer[] row = {
        Integer.valueOf(ix + 1),
        Integer.valueOf(depth_fetched[ix]),
        Integer.valueOf(depth_parsed[ix]),
        Integer.valueOf(depth_incl[ix]),
      };
      outputMessage(rowFormat.sprintf(row), PLAIN_MESSAGE);
    }

    outputMessage("\nRemaining unfetched: " + toCrawl.size(), PLAIN_MESSAGE);

    final long secs = elapsedTime / Constants.SECOND;
    // The rate is only computed once at least a full second has elapsed
    final long fetchRate = (secs > 0) ? fetchCount * 60 * Constants.SECOND / elapsedTime : 0;
    outputMessage(
        "\nElapsed Time: " + secs + " secs." + "    Fetch Rate: " + fetchRate + " p/m",
        PLAIN_MESSAGE);
  }

  /** Receiver for the tester's output, used in place of a writer when one is supplied. */
  public interface MessageHandler {
    /**
     * Receives one message.
     *
     * @param message the message text
     * @param messageType one of the {@code *_MESSAGE} constants of {@link CrawlRuleTester}
     */
    void outputMessage(String message, int messageType);

    /** Called once from {@code run()} when the tester has finished. */
    void close();
  }

  /**
   * Link-extractor callback that records every extracted url and sorts its normalized form into
   * the included or excluded set of the enclosing tester.
   */
  private class MyLinkExtractorCallback implements LinkExtractor.Callback {

    MyLinkExtractorCallback() {}

    /**
     * Records {@code url} as extracted, then classifies it: included when its protocol is
     * supported and the archival unit says it should be cached, excluded otherwise. A url that
     * cannot be normalized is excluded in its raw form.
     */
    public void foundLink(String url) {
      m_extracted.add(url);
      try {
        String normalized = UrlUtil.normalizeUrl(url);
        boolean wanted =
            BaseCrawler.isSupportedUrlProtocol(normalized) && m_au.shouldBeCached(normalized);
        TreeSet destination = wanted ? m_incls : m_excls;
        destination.add(normalized);
      } catch (MalformedURLException e) {
        m_excls.add(url);
      }
    }
  }

  /**
   * Minimal {@link CachedUrl} stub that wraps a url string and a {@link Reader}.
   *
   * <p>Only the url, the reader, the version (always 1), the encoding and a few fixed answers are
   * implemented; most other operations throw {@link UnsupportedOperationException}.
   */
  class MyMockCachedUrl implements CachedUrl {
    private String url;
    // Never set to true within this class, so hasContent() always reports false
    private boolean doesExist = false;
    private Reader reader = null;

    /**
     * @param url the url this object represents
     * @param reader the reader returned by {@link #openForReading()}
     */
    public MyMockCachedUrl(String url, Reader reader) {
      this.url = url;

      this.reader = reader;
    }

    public ArchivalUnit getArchivalUnit() {
      throw new UnsupportedOperationException("Not implemented");
    }

    public String getUrl() {
      return url;
    }

    public CachedUrl getCuVersion(int version) {
      throw new UnsupportedOperationException("Not implemented");
    }

    public CachedUrl[] getCuVersions() {
      throw new UnsupportedOperationException("Not implemented");
    }

    public CachedUrl[] getCuVersions(int maxVersions) {
      throw new UnsupportedOperationException("Not implemented");
    }

    public int getVersion() {
      return 1;
    }

    /** Returns the reader supplied at construction. */
    public Reader openForReading() {
      return reader;
    }

    public LinkRewriterFactory getLinkRewriterFactory() {
      throw new UnsupportedOperationException("Not implemented");
    }

    public String getEncoding() {
      return Constants.DEFAULT_ENCODING;
    }

    /**
     * getUnfilteredInputStream
     *
     * @return InputStream
     */
    public InputStream getUnfilteredInputStream() {
      throw new UnsupportedOperationException("Not implemented");
    }

    /**
     * getUnfilteredInputStream
     *
     * @return InputStream
     */
    public InputStream getUnfilteredInputStream(HashedInputStream.Hasher hasher) {
      throw new UnsupportedOperationException("Not implemented");
    }

    public InputStream getUncompressedInputStream() {
      throw new UnsupportedOperationException("Not implemented");
    }

    public InputStream getUncompressedInputStream(HashedInputStream.Hasher hasher) {
      throw new UnsupportedOperationException("Not implemented");
    }

    /**
     * openForHashing
     *
     * @return InputStream
     */
    public InputStream openForHashing() {
      throw new UnsupportedOperationException("Not implemented");
    }

    /**
     * openForHashing
     *
     * @param hasher HashedInputStream.Hasher for unfiltered content
     * @return InputStream
     */
    public InputStream openForHashing(HashedInputStream.Hasher hasher) {
      throw new UnsupportedOperationException("Not implemented");
    }

    /**
     * getContentSize
     *
     * @return long
     */
    public long getContentSize() {
      throw new UnsupportedOperationException("Not implemented");
    }

    public String getContentType() {
      throw new UnsupportedOperationException("Not implemented");
    }

    // Options are accepted and ignored
    public void setOption(String option, String val) {}

    public boolean hasContent() {
      return doesExist;
    }

    public boolean isLeaf() {
      return true;
    }

    public int getType() {
      return CachedUrlSetNode.TYPE_CACHED_URL;
    }

    public CIProperties getProperties() {
      return null;
    }

    // Properties are accepted and ignored
    public void addProperty(String key, String value) {}

    // Nothing to release
    public void release() {}

    public String toString() {
      StringBuffer sb = new StringBuffer(url.length() + 17);
      sb.append("[MyMockCachedUrl: ");
      sb.append(url);
      sb.append("]");
      return sb.toString();
    }

    @Override
    public FileMetadataExtractor getFileMetadataExtractor(MetadataTarget target) {
      return null;
    }

    public CachedUrl getArchiveMemberCu(ArchiveMemberSpec ams) {
      throw new UnsupportedOperationException("Not implemented");
    }

    @Override
    public boolean isArchiveMember() {
      return false;
    }
  }
}
Example #19
0
 BERec(long when, BackgroundTask task, Schedule.EventType event) {
   this.when = Deadline.at(when);
   this.task = task;
   this.event = event;
 }
Example #20
0
 /**
  * Creates a {@link StepperTask} driven by a new {@code MyMockStepper}, whose two deadlines are
  * {@code Deadline.at(start)} and {@code Deadline.at(end)}, with the given duration and callback
  * and a null cookie.
  */
 StepTask task(long start, long end, long duration, TaskCallback cb) {
   Deadline from = Deadline.at(start);
   Deadline until = Deadline.at(end);
   MyMockStepper mockStepper = new MyMockStepper();
   return new StepperTask(from, until, duration, cb, null, mockStepper);
 }
Example #21
0
  /**
   * Explode the archive into its constituent elements.
   *
   * <p>Optionally stores the archive itself first (when {@code storeArchive} is set), then wraps
   * the stream in an archive reader, skips the first record, and hands every remaining record
   * whose url starts with {@code "http:"} to the helper. Entries the helper maps to a base url are
   * stored; entries it cannot map count as bad. Records without a url are skipped. After every
   * {@code sleepAfter} records the method pauses for the configured retry-pause interval.
   *
   * <p>The reader and the underlying stream are both closed on exit, even if closing the reader
   * fails.
   *
   * @throws CacheException.ExploderException if no reader could be created, an I/O error occurs
   *     (including while closing the reader), or the helper fails
   * @throws CacheException.UnretryableException if any entry was bad or no entry was stored
   */
  public void explode() throws CacheException {
    int goodEntries = 0;
    int badEntries = 0;
    int ignoredEntries = 0;
    int entriesBetweenSleep = 0;
    ArchiveReader arcReader = null;

    logger.info(
        (storeArchive ? "Storing" : "Fetching") + " WARC file: " + origUrl + " will explode");
    try {
      if (storeArchive) {
        UrlCacher uc = au.makeUrlCacher(new UrlData(arcStream, arcProps, fetchUrl));
        BitSet bs = new BitSet();
        bs.set(UrlCacher.DONT_CLOSE_INPUT_STREAM_FLAG);
        uc.setFetchFlags(bs);
        uc.storeContent();
        // Storing consumed the stream; rewind so it can be read again for exploding
        archiveData.resetInputStream();
        arcStream = archiveData.input;
      }
      // Wrap it in an ArchiveReader
      logger.debug3("About to wrap stream");
      arcReader = wrapStream(fetchUrl, arcStream);
      logger.debug3("wrapStream() returns " + (arcReader == null ? "null" : "non-null"));
      // Explode it
      if (arcReader == null) {
        throw new CacheException.ExploderException("no WarcReader for " + origUrl);
      }
      logger.debug("Exploding " + fetchUrl);
      // Iterate through the elements in the WARC file, except the first
      Iterator i = arcReader.iterator();
      // Skip first record, if there is one; an empty archive falls through to the
      // "bad entries" failure below instead of throwing NoSuchElementException
      if (i.hasNext()) {
        i.next();
      }
      while (i.hasNext()) {
        // XXX probably not necessary
        helper.pokeWDog();
        if ((++entriesBetweenSleep % sleepAfter) == 0) {
          long pauseTime =
              CurrentConfig.getTimeIntervalParam(PARAM_RETRY_PAUSE, DEFAULT_RETRY_PAUSE);
          Deadline pause = Deadline.in(pauseTime);
          logger.debug3("Sleeping for " + StringUtil.timeIntervalToString(pauseTime));
          // Keep sleeping until the full pause has elapsed, even if interrupted
          while (!pause.expired()) {
            try {
              pause.sleep();
            } catch (InterruptedException ie) {
              // no action
            }
          }
        }
        ArchiveRecord element = (ArchiveRecord) i.next();
        // Each element is a URL to be cached in a suitable AU
        ArchiveRecordHeader elementHeader = element.getHeader();
        String elementUrl = elementHeader.getUrl();
        String elementMimeType = elementHeader.getMimetype();
        long elementLength = elementHeader.getLength();
        logger.debug2("WARC url " + elementUrl + " mime " + elementMimeType);
        // Records without a url are skipped like any other non-http record
        if (elementUrl != null && elementUrl.startsWith("http:")) {
          ArchiveEntry ae =
              new ArchiveEntry(
                  elementUrl,
                  elementLength,
                  0, // XXX need to convert getDate string to long
                  element, // ArchiveRecord extends InputStream
                  this,
                  fetchUrl);
          ae.setHeaderFields(makeCIProperties(elementHeader));
          long bytesStored = elementLength;
          logger.debug3("ArchiveEntry: " + ae.getName() + " bytes " + bytesStored);
          try {
            helper.process(ae);
          } catch (PluginException ex) {
            throw new CacheException.ExploderException("helper.process() threw", ex);
          }
          if (ae.getBaseUrl() != null) {
            if (ae.getRestOfUrl() != null && ae.getHeaderFields() != null) {
              storeEntry(ae);
              handleAddText(ae);
              goodEntries++;
              crawlFacade.getCrawlerStatus().addContentBytesFetched(bytesStored);
            } else {
              ignoredEntries++;
            }
          } else {
            badEntries++;
            logger.debug2("Can't map " + elementUrl + " from " + archiveUrl);
          }
        }
      }
    } catch (IOException ex) {
      throw new CacheException.ExploderException(ex);
    } finally {
      try {
        if (arcReader != null) {
          arcReader.close();
        }
      } catch (IOException ex) {
        throw new CacheException.ExploderException(ex);
      } finally {
        // Must run even when closing the reader fails, otherwise the stream leaks
        IOUtil.safeClose(arcStream);
      }
    }
    if (badEntries == 0 && goodEntries > 0) {
      // Make it look like a new crawl finished on each AU to which
      // URLs were added.
      for (Iterator it = touchedAus.iterator(); it.hasNext(); ) {
        ArchivalUnit au = (ArchivalUnit) it.next();
        logger.debug3(archiveUrl + " touching " + au.toString());
        AuUtil.getDaemon(au).getNodeManager(au).newContentCrawlFinished();
      }
    } else {
      String msg = archiveUrl + ": " + badEntries + "/" + goodEntries + " bad entries";
      throw new CacheException.UnretryableException(msg);
    }
  }