/** * Check commandline parameters and run the indexer for all configured languages. * * @param args Commandline arguments. * @throws IOException Thrown, if target directory is not accessible */ private void runMain(@NotNull final String... args) throws Exception { new CmdLineParser(this.cliParams); // parser.parseArgument(args); parseWithHelp(this.cliParams, args); // check, if files and directories are sane this.cliParams.check(); // create target, if it does not exist already Files.createDirectories(this.cliParams.dataDir.toPath()); // setup REST client final JestClientFactory factory = new JestClientFactory(); factory.setHttpClientConfig(new Builder(ES_CONF.URL).multiThreaded(true).build()); this.client = factory.getObject(); // languages to index, initially empty Collection<Language> runLanguages = Collections.emptyList(); // decide which languages to index if (this.cliParams.onlyLang != null) { // create an index for a single language only LOG.info("Processing language '{}' only as requested by user.", this.cliParams.onlyLang); final Language onlyLang = Language.getByString(this.cliParams.onlyLang); if (onlyLang != null) { runLanguages = Collections.singletonList(onlyLang); } else { LOG.error("Unknown language '{}'.", this.cliParams.onlyLang); } } else { // create an index for each known language optionally skipping single ones runLanguages = Arrays.asList(Language.values()); // optionally skip languages if (this.cliParams.skipLang.length > 0) { LOG.info("Skipping languages {} as requested by user.", this.cliParams.skipLang); for (final String skipLang : this.cliParams.skipLang) { final Language skipSrcLang = Language.getByString(skipLang); if (skipSrcLang != null) { runLanguages.remove(skipSrcLang); } } } } // run index for each specified language for (final Language lng : runLanguages) { try { indexByLanguage(lng); } catch (final Exception e) { LOG.error("Indexing failed. lang={}", lng, e); throw e; } } // close connection this.client.shutdownClient(); }
/** * Index the documents for a specific language. * * @param lang Language to index. * @throws Exception Thrown on REST request errors */ @SuppressWarnings({"ObjectAllocationInLoop", "BusyWait"}) private void indexByLanguage(@NotNull final Language lang) throws Exception { LOG.info("Creating index for: {}", lang); // claim by language final String fld_claim = ES_CONF.FLD_CLAIM_PREFIX + lang.toUpperCaseString(); // get all claims from patents in the specific language including // detailed technical description, if available @SuppressWarnings("HardcodedLineSeparator") final String query = "{\n" + // match all documents - filter later on " \"query\": {\n" + " \"match_all\": {}\n" + " },\n" + // fields to return " \"fields\": [\n" + // f: claims " \"" + fld_claim + "\",\n" + // f: detailed description " \"" + ES_CONF.FLD_DESC + "\",\n" + // f: detailed description language " \"" + ES_CONF.FLD_DESC_LNG + "\",\n" + // f: patent id " \"" + ES_CONF.FLD_PATREF + "\",\n" + // f: ipc code(s) " \"" + ES_CONF.FLD_IPC + "\"\n" + " ],\n" + " \"filter\": {\n" + // document requires to have claims or detd in target language " \"or\": [\n" + // condition: claims in target language " {\n" + " \"exists\": {\n" + " \"field\": \"" + fld_claim + "\"\n" + " }\n" + " },\n" + // condition: detd in target language " {\n" + " \"and\": [\n" + // match language " {\n" + " \"term\": {\n" + " \"" + ES_CONF.FLD_DESC_LNG + "\": " + " \"" + lang + "\"\n" + " }\n" + " },\n" + // require field to exist " {\n" + " \"exists\": {\n" + " \"field\": \"" + ES_CONF.FLD_DESC + "\"\n" + " }\n" + " }\n" + " ]\n" + " }\n" + " ]\n" + " }\n" + '}'; // setup the search using scan & scroll final Search search = new Search.Builder(query) // index to query .addIndex(ES_CONF.INDEX) // document type to retrieve .addType(ES_CONF.DOC_TYPE) // FIXME: using SearchType.SCAN does not work. ES instance requires // parameter to be a lower-cased string. .setParameter(Parameters.SEARCH_TYPE, "scan") // SearchType.SCAN) // hits per shard, each scroll .setParameter(Parameters.SIZE, ES_CONF.PAGE_SIZE) // keep scroll open for a specific time .setParameter(Parameters.SCROLL, ES_CONF.SCROLL_KEEP) .build(); // initialize the scroll search JestResult result = ESUtils.runRequest(this.client, search); if (!result.isSucceeded()) { LOG.error("Initial request failed. {}", result.getErrorMessage()); } JsonObject resultJson = result.getJsonObject(); JsonArray hits = resultJson.getAsJsonObject("hits").getAsJsonArray("hits"); String scrollId = resultJson.get("_scroll_id").getAsString(); final String hitsTotal = resultJson.getAsJsonObject("hits").get("total").toString(); int currentResultSize = hits.size(); if (LOG.isDebugEnabled()) { LOG.debug("{} - hits:{}/{} scroll-id:{}", lang, currentResultSize, hitsTotal, scrollId); } final Path targetPath = new File(this.cliParams.dataDir.getAbsolutePath() + File.separator + lang).toPath(); Files.createDirectories(targetPath); final IndexBuilder iBuilder = new IndexBuilder( targetPath, lang, CliCommon.getStopwords( lang.toString(), this.cliParams.stopFileFormat, this.cliParams.stopFilePattern)); indexResults(iBuilder, hits); // scroll through pages to gather all results int pageNumber = 1; // number of pages requested long dataCount = (long) currentResultSize; // number of items retrieved int delay; // throttle delay do { // retry a request if it has timed out delay = (1 + this.rand.nextInt(10)) * 100; Thread.sleep((long) delay); result = ESUtils.runRequest( this.client, new SearchScroll.Builder(scrollId, ES_CONF.SCROLL_KEEP).build()); if (result.isSucceeded()) { // parse result set resultJson = result.getJsonObject(); hits = resultJson.getAsJsonObject("hits").getAsJsonArray("hits"); currentResultSize = hits.size(); scrollId = resultJson.get("_scroll_id").getAsString(); // index results indexResults(iBuilder, hits); dataCount += (long) currentResultSize; LOG.info( "{} - hits:{}/{}/{} page:{} scroll-id:{} ~{}", lang, NumberFormat.getIntegerInstance().format((long) currentResultSize), NumberFormat.getIntegerInstance().format(dataCount), hitsTotal.isEmpty() ? "?" : NumberFormat.getIntegerInstance().format(Long.valueOf(hitsTotal)), pageNumber++, scrollId, delay); } else { LOG.error("Result failed: {}. Trying to proceed.", result.getErrorMessage()); } } while (currentResultSize == ES_CONF.PAGE_SIZE); iBuilder.close(); }