/**
 * Builds the {@link ArticleFiles} for a matched URL. The same cached URL is recorded both as
 * the full-text CU and under {@link ArticleFiles#ROLE_ABSTRACT}.
 *
 * @param cu the cached URL that matched the iterator pattern
 * @param mat the matcher for {@code cu}'s URL (not consulted by this method)
 * @return a new ArticleFiles whose full-text CU and abstract role are both {@code cu}
 */
protected ArticleFiles processUrl(CachedUrl cu, Matcher mat) {
  final ArticleFiles articleFiles = new ArticleFiles();
  articleFiles.setFullTextCu(cu);
  articleFiles.setRoleCu(ArticleFiles.ROLE_ABSTRACT, cu);
  // XXX The full text PDF link is embedded in the page, so its URL cannot be guessed here
  return articleFiles;
}
  /**
   * Functional test over real TAR content: stores {@code realTARFile_A} and
   * {@code realTARFile_B} in {@code tarAu}, logs every cached URL in the AU, then runs an
   * {@link ElsevierDeferredArticleMetadataExtractor} over each article returned by the AU's
   * article iterator and validates every emitted metadata record.
   *
   * @throws Exception if a TAR resource cannot be stored or extraction fails; a load failure
   *     now fails the test instead of being printed and ignored
   */
  public void testFunctionalFromTarHierarchy() throws Exception {
    log.debug3("in testFromTarHierarchy");
    // Load the tarballs. No catch here on purpose: an IOException must fail the test rather
    // than let the assertions below run against an AU with missing content.
    InputStream file_input = null;
    try {
      file_input = getResourceAsStream(realTARFile_A);
      UrlCacher uc = tarAu.makeUrlCacher(new UrlData(file_input, tarHeader, TAR_A_BASE));
      uc.storeContent();
      IOUtil.safeClose(file_input);

      file_input = getResourceAsStream(realTARFile_B);
      uc = tarAu.makeUrlCacher(new UrlData(file_input, tarHeader, TAR_B_BASE));
      uc.storeContent();
    } finally {
      // Closes whichever stream was opened last (also on the failure path).
      IOUtil.safeClose(file_input);
    }

    CachedUrlSet cus = tarAu.getAuCachedUrlSet();
    for (CachedUrl cu : cus.getCuIterable()) {
      log.debug3("AU - cu is: " + cu.getUrl());
      cu.release();
    }

    // We need to start from the level of the ArticleMetadataExtractor
    MyListEmitter emitter = new MyListEmitter();
    ArticleMetadataExtractor amEx =
        new ElsevierDeferredArticleMetadataExtractor(ArticleFiles.ROLE_ARTICLE_METADATA);

    Iterator<ArticleFiles> it = tarAu.getArticleIterator(MetadataTarget.Any());
    while (it.hasNext()) {
      ArticleFiles af = it.next();
      log.debug3("Metadata test - articlefiles " + af.toString());
      CachedUrl cu = af.getRoleCu(ArticleFiles.ROLE_ARTICLE_METADATA);
      // Fail with an assertion rather than a NullPointerException when the role is missing.
      assertNotNull(cu);
      log.debug3("metadata cu is " + cu.getUrl());
      amEx.extract(MetadataTarget.Any(), af, emitter);
      List<ArticleMetadata> returnList = emitter.getAmList();

      assertNotNull(returnList);
      log.debug3("size of returnList is " + returnList.size());
      for (ArticleMetadata mdRecord : returnList) {
        validateCompleteMetadataRecord(mdRecord);
      }
    }
  }
  /**
   * Crawls the simulated AU, copies its HTML and PDF files into {@code nau} under
   * journal-style URLs, deletes a subset of them, and then checks the article iterator's
   * output: total article count, the number of articles without a full-text PDF role, and the
   * number whose full-text URL is the PDF URL.
   *
   * @throws Exception if crawling, copying or iterating fails
   */
  public void testArticleCountAndType() throws Exception {
    int expCount = 28;
    PluginTestUtil.crawlSimAu(sau);
    String pat1 = "branch(\\d+)/(\\d+file\\.html)";
    String rep1 = "aps/journal/v123/n$1/full/$2";
    PluginTestUtil.copyAu(sau, nau, ".*[^.][^p][^d][^f]$", pat1, rep1);
    String pat2 = "branch(\\d+)/(\\d+file\\.pdf)";
    String rep2 = "aps/journal/v123/n$1/pdf/$2";
    PluginTestUtil.copyAu(sau, nau, ".*\\.pdf$", pat2, rep2);

    // Remove some URLs
    int deleted = 0;
    for (Iterator<?> nodeIt = nau.getAuCachedUrlSet().contentHashIterator(); nodeIt.hasNext(); ) {
      CachedUrlSetNode cusn = (CachedUrlSetNode) nodeIt.next();
      if (cusn instanceof CachedUrl) {
        CachedUrl cu = (CachedUrl) cusn;
        String url = cu.getUrl();
        if (url.contains("/journal/")
            && (url.endsWith("1file.html") || url.endsWith("2file.pdf"))) {
          deleteBlock(cu);
          ++deleted;
        }
      }
    }
    assertEquals(8, deleted);

    Iterator<ArticleFiles> it = nau.getArticleIterator();
    int count = 0;
    int countHtmlOnly = 0;
    int countPdfOnly = 0;
    while (it.hasNext()) {
      ArticleFiles af = it.next();
      log.info(af.toString());
      CachedUrl cu = af.getFullTextCu();
      // Check for null before dereferencing so a missing full-text CU is an assertion failure.
      assertNotNull(cu);
      String url = cu.getUrl();
      String contentType = cu.getContentType();
      log.debug("count " + count + " url " + url + " " + contentType);
      count++;
      String pdfUrl = af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF);
      if (pdfUrl == null) {
        ++countHtmlOnly;
      } else if (pdfUrl.equals(url)) {
        // Compare by value: reference equality on Strings only works by accident.
        ++countPdfOnly;
      }
    }
    log.debug("Article count is " + count);
    assertEquals(expCount, count);
    assertEquals(4, countHtmlOnly);
    assertEquals(4, countPdfOnly);
  }
    /**
     * Builds the {@link ArticleFiles} for a matched full-text PDF. The PDF is always recorded
     * under {@link ArticleFiles#ROLE_FULL_TEXT_PDF}. The full-text CU is the URL obtained by
     * applying the replacement {@code $1aspx} to the matcher when that URL has content in the
     * AU, and the PDF itself otherwise.
     *
     * @param cu the cached URL of the PDF
     * @param mat the matcher for {@code cu}'s URL, used to derive the {@code aspx} URL
     * @return the populated ArticleFiles
     */
    protected ArticleFiles processFullTextPdf(CachedUrl cu, Matcher mat) {
      final ArticleFiles articleFiles = new ArticleFiles();
      articleFiles.setRoleCu(ArticleFiles.ROLE_FULL_TEXT_PDF, cu);

      final String aspxUrl = mat.replaceAll("$1aspx");
      final CachedUrl aspxCu = cu.getArchivalUnit().makeCachedUrl(aspxUrl);

      // Prefer the aspx page as full text when it was collected; fall back to the PDF.
      articleFiles.setFullTextCu(aspxCu.hasContent() ? aspxCu : cu);

      log.debug3("returning full text: " + articleFiles.getFullTextUrl());
      return articleFiles;
    }
  /**
   * Crawls the simulated AU, copies its {@code 001file} HTML and PDF files into {@code au}
   * under {@code 52/1/S1} and {@code 52/1/S1.pdf}, and checks that the first ArticleFiles from
   * the sub-tree iterator carries the expected PDF landing-page and full-text PDF URLs.
   *
   * @throws Exception if crawling, copying or iterating fails
   */
  public void testCreateArticleFiles() throws Exception {
    PluginTestUtil.crawlSimAu(sau);

    // HTML file -> landing page URL, PDF file -> PDF URL
    PluginTestUtil.copyAu(sau, au, ".*[.]html$", "001file[.]html", "52/1/S1");
    PluginTestUtil.copyAu(sau, au, ".*[.]pdf$", "001file[.]pdf", "52/1/S1.pdf");

    final String landingUrl = "http://pediatrics.aappublications.org/cgi/reprint/52/1/S1";
    final String pdfUrl = "http://pediatrics.aappublications.org/cgi/reprint/52/1/S1.pdf";

    au.makeCachedUrl(landingUrl);
    final CachedUrl pdfCu = au.makeCachedUrl(pdfUrl);
    assertNotNull(pdfCu);

    final SubTreeArticleIterator articleIter = createSubTreeIter();
    assertNotNull(articleIter);

    final ArticleFiles articleFiles = articleIter.next();
    assertNotNull(articleFiles);
    System.out.println("article files::" + articleFiles);

    final CachedUrl landingRoleCu =
        articleFiles.getRoleCu(ArticleFiles.ROLE_FULL_TEXT_PDF_LANDING_PAGE);
    assertEquals(landingUrl, landingRoleCu.getUrl());
    final CachedUrl pdfRoleCu = articleFiles.getRoleCu(ArticleFiles.ROLE_FULL_TEXT_PDF);
    assertEquals(pdfUrl, pdfRoleCu.getUrl());
  }
  /**
   * Stores landing-page, citation-export, PDF-download and EPUB-download URLs for three books
   * in the AU, builds the expected {@link ArticleFiles} for each book, and walks the sub-tree
   * article iterator logging the full-text and PDF URLs it produces.
   *
   * <p>NOTE: the URL comparison inside the final loop is currently commented out and the
   * expected-ArticleFiles map is not consulted by the loop, so this test only logs.
   *
   * @throws Exception if storing content or iterating fails
   */
  public void testCreateArticleFiles() throws Exception {
    // create urls to store in UrlCacher
    String[] au_urls = {
      BASE_URL + "pc/doifinder/10.1057/9780123456789",
      BASE_URL + "pc/browse/citationExport?doi=10.1057/9780123456789",
      BASE_URL + "pc/doifinder/download/10.1057/9780123456789",
      BASE_URL + "pc/doifinder/download/10.1057/9780123456789.epub",
      BASE_URL + "pc/doifinder/10.1057/9781234567890",
      BASE_URL + "pc/browse/citationExport?doi=10.1057/9781234567890",
      BASE_URL + "pc/doifinder/download/10.1057/9781234567890",
      BASE_URL + "pc/doifinder/download/10.1057/9781234567890.epub",
      BASE_URL + "pc/doifinder/10.1057/9782345678901",
      BASE_URL + "pc/browse/citationExport?doi=10.1057/9782345678901",
      BASE_URL + "pc/doifinder/download/10.1057/9782345678901",
      BASE_URL + "pc/doifinder/download/10.1057/9782345678901.epub"
    };
    /*
    // get cached url content type and properties from simulated contents
    // for UrlCacher.storeContent()
    CachedUrl cuPdf = null;
    CachedUrl cuHtml = null;
    CachedUrl cuEpub = null;
    for (CachedUrl cu : AuUtil.getCuIterable(sau)) {
      if (cuPdf == null
          && cu.getContentType().toLowerCase().startsWith(Constants.MIME_TYPE_PDF)) {
        log.info("pdf contenttype: " + cu.getContentType());
        cuPdf = cu;
      } else if (cuHtml == null
          && cu.getContentType().toLowerCase().startsWith(Constants.MIME_TYPE_HTML)) {
        log.info("html contenttype: " + cu.getContentType());
        cuHtml = cu;
      } else if (cuEpub == null
          && cu.getContentType().toLowerCase().startsWith("application/epub")) {
        log.info("epub contenttype: " + cu.getContentType());
        cuEpub = cu;
      }
      if (cuPdf != null && cuHtml != null && cuEpub != null) {
        break;
      }
    }
    */
    // store content using a header chosen from the URL shape:
    // ".../download/..." without ".epub" is a PDF, with ".epub" an EPUB, anything else text
    for (String url : au_urls) {
      if (url.contains("download") && !url.endsWith(".epub")) {
        storeContent(random_content_stream, pdfHeader, url);
      } else if (url.contains("download")) { // epub
        storeContent(random_content_stream, epubHeader, url);
      } else {
        storeContent(random_content_stream, textHeader, url);
      }
    }

    // book 9780123456789
    ArticleFiles af1 = new ArticleFiles();
    af1.setRoleString(
        ArticleFiles.ROLE_FULL_TEXT_PDF, BASE_URL + "pc/doifinder/download/10.1057/9780123456789");
    af1.setRoleString(
        ArticleFiles.ROLE_ARTICLE_METADATA,
        BASE_URL + "pc/browse/citationExport?doi=10.1057/9780123456789");
    af1.setRoleString(
        ArticleFiles.ROLE_FULL_TEXT_EPUB,
        BASE_URL + "pc/doifinder/download/10.1057/9780123456789.epub");
    // book 9781234567890 (roles go on af2, not af1)
    ArticleFiles af2 = new ArticleFiles();
    af2.setRoleString(
        ArticleFiles.ROLE_FULL_TEXT_PDF, BASE_URL + "pc/doifinder/download/10.1057/9781234567890");
    af2.setRoleString(
        ArticleFiles.ROLE_ARTICLE_METADATA,
        BASE_URL + "pc/browse/citationExport?doi=10.1057/9781234567890");
    af2.setRoleString(
        ArticleFiles.ROLE_FULL_TEXT_EPUB,
        BASE_URL + "pc/doifinder/download/10.1057/9781234567890.epub");
    // book 9782345678901 (roles go on af3, not af1)
    ArticleFiles af3 = new ArticleFiles();
    af3.setRoleString(
        ArticleFiles.ROLE_FULL_TEXT_PDF, BASE_URL + "pc/doifinder/download/10.1057/9782345678901");
    af3.setRoleString(
        ArticleFiles.ROLE_ARTICLE_METADATA,
        BASE_URL + "pc/browse/citationExport?doi=10.1057/9782345678901");
    af3.setRoleString(
        ArticleFiles.ROLE_FULL_TEXT_EPUB,
        BASE_URL + "pc/doifinder/download/10.1057/9782345678901.epub");

    // key the expected content to the fullTextUrl for the ArticleFiles
    HashMap<String, ArticleFiles> fullUrlToAF = new HashMap<String, ArticleFiles>();
    fullUrlToAF.put(BASE_URL + "pc/doifinder/download/10.1057/9780123456789", af1);
    fullUrlToAF.put(BASE_URL + "pc/doifinder/download/10.1057/9781234567890", af2);
    fullUrlToAF.put(BASE_URL + "pc/doifinder/download/10.1057/9782345678901", af3);

    // get article iterator, get article files and the appropriate urls according
    // to their roles.
    String[] expectedUrls = {
      EXPECTED_FULL_TEXT_URL, EXPECTED_PDF_URL,
    };
    for (SubTreeArticleIterator artIter = createSubTreeIter(); artIter.hasNext(); ) {
      ArticleFiles af = artIter.next();
      String[] actualUrls = {
        af.getFullTextUrl(), af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF),
        // af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF_LANDING_PAGE)
      };
      log.info("actualUrls: " + actualUrls.length);
      for (int i = 0; i < actualUrls.length; i++) {
        log.info("e_url: " + expectedUrls[i]);

        log.info("url: " + actualUrls[i]);
        // assertEquals(expectedUrls[i], actualUrls[i]);
      }
    }
  }
    /*
     * Finds the full text PDF for an abstract page: looks for the citation_pdf_url meta tag in
     * the abstract HTML, takes the PDF URL from it, normalizes that URL (reorders the query
     * params and forces a "www." host) and looks up the matching cached URL.
     *
     * The returned ArticleFiles uses the abstract as full text unless a PDF CU with content is
     * found, in which case the PDF replaces it and is also stored under ROLE_FULL_TEXT_PDF.
     */
    protected ArticleFiles processAbstract(CachedUrl absCu, Matcher absMat) {
      final ArticleFiles result = new ArticleFiles();
      NodeList pdfMetaTags = null;

      if (absCu != null && absCu.hasContent()) {
        // TEMPORARY: the abstract CU is the default full text CU in case no PDF CU
        // with content exists; the current metadata manager only uses the full
        // text CU, but this will change with the new metadata schema that can
        // have multiple CUs for an article.
        result.setFullTextCu(absCu);
        result.setRoleCu(ArticleFiles.ROLE_ABSTRACT, absCu);
        try {
          InputStreamSource source =
              new InputStreamSource(new Stream(absCu.getUnfilteredInputStream()));
          Parser parser = new Parser(new Lexer(new Page(source)));
          Lexer.STRICT_REMARKS = false;
          // Accept only <meta> tags whose name is citation_pdf_url (case-insensitive).
          NodeFilter pdfMetaFilter =
              new NodeFilter() {
                public boolean accept(Node node) {
                  return node instanceof MetaTag
                      && "citation_pdf_url".equalsIgnoreCase(((MetaTag) node).getMetaTagName());
                }
              };
          pdfMetaTags = parser.extractAllNodesThatMatch(pdfMetaFilter);
        } catch (ParserException e) {
          log.debug("Unable to parse abstract page html", e);
        } catch (UnsupportedEncodingException e) {
          log.debug("Bad encoding in abstact page html", e);
        } finally {
          absCu.release();
        }
      }

      try {
        // Nothing to do when the page was not parsed or carries no PDF meta tag.
        if (pdfMetaTags == null || pdfMetaTags.size() <= 0) {
          return result;
        }

        // Minimal encoding keeps the URL constructor from stripping trailing spaces.
        String rawPdfUrl = ((MetaTag) pdfMetaTags.elementAt(0)).getMetaContent();
        URL pdfUrl = new URL(UrlUtil.minimallyEncodeUrl(rawPdfUrl));

        List<String> paramOrder = new ArrayList<String>();
        paramOrder.add("fileType");
        paramOrder.add("fileId");
        paramOrder.add("fileName");
        pdfUrl = reArrangeUrlParams(pdfUrl, paramOrder);

        String host = pdfUrl.getHost();
        if (!host.startsWith("www.")) {
          pdfUrl = new URL(pdfUrl.getProtocol(), "www." + host, pdfUrl.getFile());
        }

        // The URL stays encoded because that is how URLs are stored.
        CachedUrl pdfCu = au.makeCachedUrl(pdfUrl.toString());
        if (pdfCu != null && pdfCu.hasContent()) {
          // A PDF with content takes over from the abstract as full text.
          result.setFullTextCu(pdfCu);
          result.setRoleCu(ArticleFiles.ROLE_FULL_TEXT_PDF, pdfCu);
        }
      } catch (MalformedURLException e) {
        log.debug("Badly formatted pdf url link", e);
      } catch (IllegalArgumentException e) {
        log.debug("Badly formatted pdf url link", e);
      }

      return result;
    }