public List<String> harvest() { String size = "5000"; // max size we can get. String params = "?q=*&size=" + size + "from=0"; List<String> records = null; HttpURLConnection urlConn = null; String json = null; boolean loop = true; int count = 0; String request = istexApiUrl + "/" + params; while (loop) { records = new ArrayList<String>(); try { URL url = new URL(request); logger.info(request); urlConn = (HttpURLConnection) url.openConnection(); if (urlConn != null) { urlConn.setDoInput(true); urlConn.setRequestMethod("GET"); InputStream in = urlConn.getInputStream(); json = Utilities.convertStreamToString(in); JSONParser jsonParser = new JSONParser(); JSONObject jsonObject = (JSONObject) jsonParser.parse(json); JSONArray hits = (JSONArray) jsonObject.get("hits"); request = (String) jsonObject.get("nextPageURI"); Iterator i = hits.iterator(); while (i.hasNext()) { JSONObject hit = (JSONObject) i.next(); records.add((String) hit.get("id")); } processRecords(records); if (request == null) { loop = false; } } } catch (Exception e) { e.printStackTrace(); } } logger.info(" count :" + count); return records; }
/**
 * Downloads the TEI full text of each given record and stores it through
 * {@code mm.insertExternalTeiDocument}.
 *
 * <p>For every id a GET request is sent to
 * {@code istexApiUrl + "/" + id + "/fulltext/tei"} and the response stream is
 * passed, unread, to the insert call together with the id, the source name
 * {@code "istex"}, the {@code ISTEX_TEIS} collection and the {@code date}
 * field of this object. The stream must not be consumed beforehand,
 * otherwise the insert would receive an exhausted stream.
 *
 * <p>The first failure aborts the remaining ids of the list; the exception
 * is propagated to the caller.
 *
 * @param records ids of the records to fetch and store
 * @throws MalformedURLException if a request URL cannot be built from an id
 * @throws ProtocolException     if the request method cannot be set
 * @throws IOException           if a connection cannot be opened or read
 */
public void processRecords(List<String> records) throws MalformedURLException, ProtocolException, IOException {
    System.out.println(records.size());
    for (String id : records) {
        String request = istexApiUrl + "/" + id + "/fulltext/tei";
        URL url = new URL(request);
        HttpURLConnection urlConn = (HttpURLConnection) url.openConnection();
        System.out.println(" id :" + id);
        urlConn.setDoInput(true);
        urlConn.setRequestMethod("GET");
        InputStream in = urlConn.getInputStream();
        try {
            mm.insertExternalTeiDocument(in, id, "istex", MongoCollectionsInterface.ISTEX_TEIS, date);
        } finally {
            // Release the response even when the insert fails.
            in.close();
        }
    }
}