Exemplo n.º 1
0
  /**
   * Indexes the file referenced by {@code doc} together with its descriptive fields.
   *
   * <p>The file at {@code doc.getContentFilepath()} is Base64-encoded into the {@code file}
   * field, and its text is extracted with {@code tika().parseToString(...)} into the
   * {@code content} field. Text extraction is best-effort: if it fails with an
   * {@link IOException} or {@code TikaException}, the stack trace is printed and the document is
   * still indexed with an empty {@code content} field.
   *
   * <p>The document is indexed under {@code idxName}/{@code idxType} with {@code doc.getHash()}
   * as its id, and the request is issued with refresh enabled.
   *
   * @param doc the document whose content file and descriptive fields are indexed
   * @throws IOException if the file cannot be Base64-encoded or read, or if building the JSON
   *     source fails
   */
  public void indexFile(Document doc) throws IOException {
    String data64 = org.elasticsearch.common.Base64.encodeFromFile(doc.getContentFilepath());

    File file = new File(doc.getContentFilepath());
    // Kept as a double, as before, so the value handed to the builder is unchanged.
    double bytes = file.length();

    int indexedChars = 1000000;
    Metadata metadata = new Metadata();

    // Read the whole file into memory; try-with-resources guarantees the stream is closed
    // even when read() throws (the stream used to be left open).
    byte[] data;
    try (InputStream fileReader = new FileInputStream(file)) {
      byte[] buffer = new byte[1024 * 8];
      ByteArrayOutputStream bos = new ByteArrayOutputStream();
      int read;
      while (-1 != (read = fileReader.read(buffer))) {
        bos.write(buffer, 0, read);
      }
      data = bos.toByteArray();
    }

    String parsedContent;
    try {
      // Set the maximum length of strings returned by the parseToString method, -1 sets no limit
      parsedContent =
          tika().parseToString(new BytesStreamInput(data, false), metadata, indexedChars);
    } catch (IOException | TikaException e) {
      // Deliberate best-effort: a file that cannot be parsed is still indexed, without text.
      e.printStackTrace();
      parsedContent = "";
    }

    XContentBuilder source =
        jsonBuilder()
            .startObject()
            .field("file", data64)
            .field("filename", doc.getRootFileName())
            .field("title", doc.getTitle())
            .field("author", doc.getAuthor())
            .field("created_date", doc.getDateCreated())
            .field("content_type", FilenameUtils.getExtension(doc.getRootFileName()))
            .field("content_length", bytes)
            .field("content", parsedContent)
            .endObject();

    // The response is not used; actionGet() is still called so the method waits for the
    // index operation to finish before returning.
    mClient
        .prepareIndex()
        .setIndex(idxName)
        .setType(idxType)
        .setId(doc.getHash())
        .setSource(source)
        .setRefresh(true)
        .execute()
        .actionGet();
  }