Example #1
0
  /**
   * @param args
   * @throws IOException
   * @throws ParserConfigurationException
   * @throws SAXException
   */
  public static void main(String[] args)
      throws IOException, ParserConfigurationException, SAXException {
    String requestURL =
        "https://databank.ora.ox.ac.uk/oaipmh?verb=ListRecords&resumptionToken=20121206_TKMGW4A_SWT3NHV";

    // String requestURL =
    // "http://bd2.inesc-id.pt:8080/repox2Eudml/OAIHandler?verb=ListRecords&resumptionToken=1354116062009:ELibM_external:eudml-article2:33753:37054::";
    // String requestURL =
    // "http://bd2.inesc-id.pt:8080/repox2Eudml/OAIHandler?verb=GetRecord&identifier=urn:eudml.eu:ELibM_external:05152756&metadataPrefix=eudml-article2";
    // String requestURL = "C:/Users/Gilberto Pedrosa/Desktop/OAIHandler.xml";
    // FileInputStream fis = new FileInputStream(requestURL);
    // InputStream in = fis;
    logger.debug("requestURL=" + requestURL);
    DocumentBuilderFactory factory;
    factory = DocumentBuilderFactory.newInstance();
    factory.setNamespaceAware(true);
    Thread t = Thread.currentThread();
    DocumentBuilder builder = factory.newDocumentBuilder();
    HashMap<Thread, DocumentBuilder> builderMap = new HashMap<Thread, DocumentBuilder>();
    builderMap.put(t, builder);

    InputStream in;

    URL url = new URL(requestURL);
    HttpURLConnection con;
    int responseCode;
    do {
      con = (HttpURLConnection) url.openConnection();
      con.setConnectTimeout(30000);
      con.setReadTimeout(600000);

      if (con.getAllowUserInteraction()) {
        con.setRequestProperty("User-Agent", "OAIHarvester/2.0");
        con.setRequestProperty("Accept-Encoding", "compress, gzip, identify");
      }
      try {
        responseCode = con.getResponseCode();
        logger.debug("responseCode=" + responseCode);
      } catch (FileNotFoundException e) {
        // assume it's a 503 response
        logger.error(requestURL, e);
        responseCode = HttpURLConnection.HTTP_UNAVAILABLE;
      }

      if (responseCode == HttpURLConnection.HTTP_UNAVAILABLE) {
        long retrySeconds = con.getHeaderFieldInt("Retry-After", -1);
        if (retrySeconds == -1) {
          long now = (new Date()).getTime();
          long retryDate = con.getHeaderFieldDate("Retry-After", now);
          retrySeconds = retryDate - now;
        }
        if (retrySeconds == 0) { // Apparently, it's a bad URL
          throw new FileNotFoundException("Bad URL?");
        }
        logger.warn("Server response: Retry-After=" + retrySeconds);
        if (retrySeconds > 0) {
          try {
            Thread.sleep(retrySeconds * 1000);
          } catch (InterruptedException ex) {
            ex.printStackTrace();
          }
        }
      }
    } while (responseCode == HttpURLConnection.HTTP_UNAVAILABLE);
    String contentEncoding = con.getHeaderField("Content-Encoding");
    logger.debug("contentEncoding=" + contentEncoding);
    if ("compress".equals(contentEncoding)) {
      ZipInputStream zis = new ZipInputStream(con.getInputStream());
      zis.getNextEntry();
      in = zis;
    } else if ("gzip".equals(contentEncoding)) {
      in = new GZIPInputStream(con.getInputStream());
    } else if ("deflate".equals(contentEncoding)) {
      in = new InflaterInputStream(con.getInputStream());
    } else {
      in = con.getInputStream();
    }

    byte[] inputBytes = IOUtils.toByteArray(in);
    InputSource data = new InputSource(new ByteArrayInputStream(inputBytes));

    String xmlString = new String(inputBytes, "UTF-8");
    xmlString = XmlUtil.removeInvalidXMLCharacters(xmlString);

    builder.parse(data);

    System.out.println("data = " + data);
  }
Example #2
0
  /**
   * Performs the OAI request, recovering from typical XML error
   *
   * @author nfreire Nuno Freire / Gilberto Pedrosa
   * @param requestURL
   * @throws IOException
   * @throws ParserConfigurationException
   * @throws SAXException
   * @throws TransformerException
   */
  private void harvest(String requestURL)
      throws IOException, ParserConfigurationException, SAXException, TransformerException {
    this.requestURL = requestURL;
    logger.debug("requestURL=" + requestURL);
    InputStream in;
    URL url = new URL(requestURL);
    HttpURLConnection con;
    int responseCode;
    do {
      con = (HttpURLConnection) url.openConnection();
      con.setConnectTimeout(30000);
      con.setReadTimeout(600000);

      if (con.getAllowUserInteraction()) {
        con.setRequestProperty("User-Agent", "OAIHarvester/2.0");
        con.setRequestProperty("Accept-Encoding", "compress, gzip, identify");
      }
      try {
        responseCode = con.getResponseCode();
        logger.debug("responseCode=" + responseCode);
      } catch (FileNotFoundException e) {
        // assume it's a 503 response
        logger.error(requestURL, e);
        responseCode = HttpURLConnection.HTTP_UNAVAILABLE;
      }

      if (responseCode == HttpURLConnection.HTTP_UNAVAILABLE) {
        long retrySeconds = con.getHeaderFieldInt("Retry-After", -1);
        if (retrySeconds == -1) {
          long now = (new Date()).getTime();
          long retryDate = con.getHeaderFieldDate("Retry-After", now);
          retrySeconds = retryDate - now;
        }
        if (retrySeconds == 0) { // Apparently, it's a bad URL
          throw new FileNotFoundException("Bad URL?");
        }
        logger.warn("Server response: Retry-After=" + retrySeconds);
        if (retrySeconds > 0) {
          try {
            Thread.sleep(retrySeconds * 1000);
          } catch (InterruptedException ex) {
            ex.printStackTrace();
          }
        }
      }
    } while (responseCode == HttpURLConnection.HTTP_UNAVAILABLE);
    String contentEncoding = con.getHeaderField("Content-Encoding");
    logger.debug("contentEncoding=" + contentEncoding);
    if ("compress".equals(contentEncoding)) {
      ZipInputStream zis = new ZipInputStream(con.getInputStream());
      zis.getNextEntry();
      in = zis;
    } else if ("gzip".equals(contentEncoding)) {
      in = new GZIPInputStream(con.getInputStream());
    } else if ("deflate".equals(contentEncoding)) {
      in = new InflaterInputStream(con.getInputStream());
    } else {
      in = con.getInputStream();
    }

    byte[] inputBytes = IOUtils.toByteArray(in);
    InputSource data = new InputSource(new ByteArrayInputStream(inputBytes));

    Thread t = Thread.currentThread();
    DocumentBuilder builder = builderMap.get(t);
    if (builder == null) {
      builder = factory.newDocumentBuilder();
      builderMap.put(t, builder);
    }
    try {
      doc = builder.parse(data);
    } catch (SAXException e) {
      try {
        // Here we can try to recover the xml from known typical problems

        // Recover from invalid characters
        // we assume this is UTF-8...
        String xmlString = new String(inputBytes, "UTF-8");
        xmlString = XmlUtil.removeInvalidXMLCharacters(xmlString);

        data = new InputSource(new ByteArrayInputStream(xmlString.getBytes("UTF-8")));
        doc = builder.parse(data);
      } catch (Exception e2) {
        // the recovered version did not work either. Throw the original exception
        throw e;
      }
    } catch (IOException e3) {
      System.out.println("e = " + e3.getMessage());
    } catch (Exception e4) {
      System.out.println("e = " + e4.getMessage());
    }

    StringTokenizer tokenizer = new StringTokenizer(getSingleString("/*/@xsi:schemaLocation"), " ");
    StringBuffer sb = new StringBuffer();
    while (tokenizer.hasMoreTokens()) {
      if (sb.length() > 0) sb.append(" ");
      sb.append(tokenizer.nextToken());
    }
    this.schemaLocation = sb.toString();
    this.defaultNamespace = getDocument().getDocumentElement().getNamespaceURI();
  }