Example #1
0
  /**
   * Check commandline parameters and run the indexer for all configured languages.
   *
   * <p>After the arguments are parsed and checked, the target data directory is created, the
   * REST client is set up and {@code indexByLanguage} is called once per selected language.
   * The client is shut down when the method exits, whether indexing succeeded or failed.
   *
   * <p>Language selection: if a single language was requested only that one is indexed (an
   * unknown name is logged as an error and nothing is indexed); otherwise all known languages
   * are indexed except those listed to be skipped.
   *
   * @param args Commandline arguments.
   * @throws Exception Thrown, if the target directory cannot be created or if indexing of any
   *     language fails (the failure is logged and re-thrown unchanged)
   */
  private void runMain(@NotNull final String... args) throws Exception {
    parseWithHelp(this.cliParams, args);

    // check, if files and directories are sane
    this.cliParams.check();
    // create target, if it does not exist already
    Files.createDirectories(this.cliParams.dataDir.toPath());

    // setup REST client
    final JestClientFactory factory = new JestClientFactory();
    factory.setHttpClientConfig(new Builder(ES_CONF.URL).multiThreaded(true).build());
    this.client = factory.getObject();

    try {
      // languages to index, initially empty
      Collection<Language> runLanguages = Collections.emptyList();

      // decide which languages to index
      if (this.cliParams.onlyLang != null) {
        // create an index for a single language only
        LOG.info("Processing language '{}' only as requested by user.", this.cliParams.onlyLang);
        final Language onlyLang = Language.getByString(this.cliParams.onlyLang);
        if (onlyLang != null) {
          runLanguages = Collections.singletonList(onlyLang);
        } else {
          LOG.error("Unknown language '{}'.", this.cliParams.onlyLang);
        }
      } else {
        // create an index for each known language optionally skipping single ones.
        // Arrays.asList returns a fixed-size view that rejects remove(), so a mutable
        // copy is required to be able to drop the skipped languages.
        runLanguages = new java.util.ArrayList<>(Arrays.asList(Language.values()));
        // optionally skip languages
        if (this.cliParams.skipLang.length > 0) {
          // render the array explicitly: passing it directly would bind it to the
          // logger's varargs parameter and only the first entry would be printed
          LOG.info(
              "Skipping languages {} as requested by user.",
              Arrays.toString(this.cliParams.skipLang));
          for (final String skipLang : this.cliParams.skipLang) {
            final Language skipSrcLang = Language.getByString(skipLang);
            if (skipSrcLang != null) {
              runLanguages.remove(skipSrcLang);
            } else {
              LOG.warn("Unknown language '{}' in skip list ignored.", skipLang);
            }
          }
        }
      }

      // run index for each specified language
      for (final Language lng : runLanguages) {
        try {
          indexByLanguage(lng);
        } catch (final Exception e) {
          LOG.error("Indexing failed. lang={}", lng, e);
          throw e;
        }
      }
    } finally {
      // close connection, also if indexing was aborted by an exception
      this.client.shutdownClient();
    }
  }
Example #2
0
  /**
   * Index the documents for a specific language.
   *
   * <p>Runs a scan &amp; scroll search that matches every document having either claims in the
   * given language or a detailed description marked with that language. The initial hits and
   * every following scroll page are passed to {@code indexResults} using an
   * {@code IndexBuilder} writing to a per-language sub-directory of the configured data
   * directory. The builder is closed when the method exits, whether it succeeded or failed.
   *
   * @param lang Language to index.
   * @throws IllegalStateException Thrown, if the initial search request fails
   * @throws InterruptedException Thrown, if the thread is interrupted while throttling between
   *     scroll requests (the interrupt flag is restored before re-throwing)
   * @throws Exception Thrown on REST request errors
   */
  @SuppressWarnings({"ObjectAllocationInLoop", "BusyWait"})
  private void indexByLanguage(@NotNull final Language lang) throws Exception {
    LOG.info("Creating index for: {}", lang);

    // claim by language
    final String claimField = ES_CONF.FLD_CLAIM_PREFIX + lang.toUpperCaseString();

    // get all claims from patents in the specific language including
    // detailed technical description, if available
    @SuppressWarnings("HardcodedLineSeparator")
    final String query =
        "{\n"
            // match all documents - filter later on
            + "  \"query\": {\n"
            + "    \"match_all\": {}\n"
            + "  },\n"
            // fields to return
            + "  \"fields\": [\n"
            // f: claims
            + "    \""
            + claimField
            + "\",\n"
            // f: detailed description
            + "    \""
            + ES_CONF.FLD_DESC
            + "\",\n"
            // f: detailed description language
            + "    \""
            + ES_CONF.FLD_DESC_LNG
            + "\",\n"
            // f: patent id
            + "    \""
            + ES_CONF.FLD_PATREF
            + "\",\n"
            // f: ipc code(s)
            + "    \""
            + ES_CONF.FLD_IPC
            + "\"\n"
            + "  ],\n"
            + "  \"filter\": {\n"
            // document requires to have claims or detd in target language
            + "    \"or\": [\n"
            //     condition: claims in target language
            + "      {\n"
            + "        \"exists\": {\n"
            + "          \"field\": \""
            + claimField
            + "\"\n"
            + "        }\n"
            + "      },\n"
            //     condition: detd in target language
            + "      {\n"
            + "        \"and\": [\n"
            //         match language
            + "          {\n"
            + "            \"term\": {\n"
            + "              \""
            + ES_CONF.FLD_DESC_LNG
            + "\": "
            + "                 \""
            + lang
            + "\"\n"
            + "            }\n"
            + "          },\n"
            //         require field to exist
            + "          {\n"
            + "            \"exists\": {\n"
            + "              \"field\": \""
            + ES_CONF.FLD_DESC
            + "\"\n"
            + "            }\n"
            + "          }\n"
            + "        ]\n"
            + "      }\n"
            + "    ]\n"
            + "  }\n"
            + '}';

    // setup the search using scan & scroll
    final Search search =
        new Search.Builder(query)
            // index to query
            .addIndex(ES_CONF.INDEX)
            // document type to retrieve
            .addType(ES_CONF.DOC_TYPE)
            // FIXME: using SearchType.SCAN does not work. ES instance requires
            // parameter to be a lower-cased string.
            .setParameter(Parameters.SEARCH_TYPE, "scan") // SearchType.SCAN)
            // hits per shard, each scroll
            .setParameter(Parameters.SIZE, ES_CONF.PAGE_SIZE)
            // keep scroll open for a specific time
            .setParameter(Parameters.SCROLL, ES_CONF.SCROLL_KEEP)
            .build();

    // initialize the scroll search
    JestResult result = ESUtils.runRequest(this.client, search);

    // without a successful initial response there is neither a scroll id nor a hits
    // object to continue with - fail with a meaningful message instead of running into
    // a NullPointerException while parsing the response below
    if (!result.isSucceeded()) {
      throw new IllegalStateException(
          "Initial request failed. lang=" + lang + " error=" + result.getErrorMessage());
    }

    JsonObject resultJson = result.getJsonObject();
    final JsonObject initialHits = resultJson.getAsJsonObject("hits");
    JsonArray hits = initialHits.getAsJsonArray("hits");

    String scrollId = resultJson.get("_scroll_id").getAsString();
    final String hitsTotal = initialHits.get("total").toString();
    int currentResultSize = hits.size();

    if (LOG.isDebugEnabled()) {
      LOG.debug("{} - hits:{}/{} scroll-id:{}", lang, currentResultSize, hitsTotal, scrollId);
    }

    // per-language sub-directory below the data directory
    final Path targetPath =
        this.cliParams.dataDir.getAbsoluteFile().toPath().resolve(lang.toString());
    Files.createDirectories(targetPath);
    final IndexBuilder iBuilder =
        new IndexBuilder(
            targetPath,
            lang,
            CliCommon.getStopwords(
                lang.toString(), this.cliParams.stopFileFormat, this.cliParams.stopFilePattern));

    try {
      indexResults(iBuilder, hits);

      // loop-invariant formatting helpers for the progress message
      final NumberFormat intFormat = NumberFormat.getIntegerInstance();
      final String hitsTotalFormatted =
          hitsTotal.isEmpty() ? "?" : intFormat.format(Long.parseLong(hitsTotal));

      // scroll through pages to gather all results
      int pageNumber = 1; // number of pages requested
      long dataCount = (long) currentResultSize; // number of items retrieved
      do {
        // randomized throttle delay (milliseconds) between two scroll requests
        final int delay = (1 + this.rand.nextInt(10)) * 100;

        try {
          Thread.sleep((long) delay);
        } catch (final InterruptedException e) {
          // keep the interrupt visible to callers further up the stack
          Thread.currentThread().interrupt();
          throw e;
        }
        result =
            ESUtils.runRequest(
                this.client, new SearchScroll.Builder(scrollId, ES_CONF.SCROLL_KEEP).build());

        if (result.isSucceeded()) {
          // parse result set
          resultJson = result.getJsonObject();
          hits = resultJson.getAsJsonObject("hits").getAsJsonArray("hits");
          currentResultSize = hits.size();
          scrollId = resultJson.get("_scroll_id").getAsString();

          // index results
          indexResults(iBuilder, hits);
          dataCount += (long) currentResultSize;

          LOG.info(
              "{} - hits:{}/{}/{} page:{} scroll-id:{} ~{}",
              lang,
              intFormat.format((long) currentResultSize),
              intFormat.format(dataCount),
              hitsTotalFormatted,
              pageNumber,
              scrollId,
              delay);
          pageNumber++;
        } else {
          // NOTE(review): on failure currentResultSize keeps the value of the previous
          // page, so the request is only repeated if that page was a full one and is
          // then repeated without an upper bound - consider an explicit retry limit.
          LOG.error("Result failed: {}. Trying to proceed.", result.getErrorMessage());
        }
        // NOTE(review): the page size is passed as "hits per shard" above; if the index has
        // more than one shard a page may hold more than PAGE_SIZE hits and this check would
        // stop early - confirm, or loop until an empty page is returned.
      } while (currentResultSize == ES_CONF.PAGE_SIZE);
    } finally {
      // release the index builder, also if indexing was aborted by an exception
      iBuilder.close();
    }
  }