/**
 * Converts a Solr input document into a morphline record.
 *
 * <p>Every field of the document is visited and all of its values are appended to the
 * record under the same field name.
 *
 * @param doc the Solr document whose fields are copied
 * @return a newly created record holding the document's field values
 */
private Record toRecord(SolrInputDocument doc) {
  Record result = new Record();
  for (Entry<String, SolrInputField> fieldEntry : doc.entrySet()) {
    String fieldName = fieldEntry.getKey();
    SolrInputField field = fieldEntry.getValue();
    result.getFields().putAll(fieldName, field.getValues());
  }
  return result;
}
/**
 * Checks whether the record carries a {@code Fields.ATTACHMENT_MIME_TYPE} field.
 *
 * <p>When the field is absent, the failure is logged at debug level.
 *
 * @param record the record to inspect
 * @return {@code true} if the MIME type field is present, {@code false} otherwise
 */
private boolean hasAtLeastOneMimeType(Record record) {
  if (record.getFields().containsKey(Fields.ATTACHMENT_MIME_TYPE)) {
    return true;
  }
  LOG.debug("Command failed because of missing MIME type for record: {}", record);
  return false;
}
@Override protected boolean doProcess(Record record) { Timer.Context timerContext = elapsedTime.time(); try { XContentBuilder documentBuilder = jsonBuilder().startObject(); Map<String, Collection<Object>> map = record.getFields().asMap(); for (Map.Entry<String, Collection<Object>> entry : map.entrySet()) { String key = entry.getKey(); Iterator<Object> iterator = entry.getValue().iterator(); while (iterator.hasNext()) { documentBuilder.field(key, iterator.next()); } } documentBuilder.endObject(); loader.addDocument(documentBuilder.bytes(), indexName, indexType, ttl); } catch (Exception e) { throw new MorphlineRuntimeException(e); } finally { timerContext.stop(); } // pass record to next command in chain: return super.doProcess(record); }
@Override protected boolean doProcess(Record record) { if (preserveExisting && record .getFields() .containsKey( Fields.ATTACHMENT_MIME_TYPE)) {; // we must preserve the existing MIME type } else { List attachments = record.get(Fields.ATTACHMENT_BODY); if (attachments.size() > 0) { Object attachment = attachments.get(0); Preconditions.checkNotNull(attachment); InputStream stream; if (attachment instanceof byte[]) { stream = new ByteArrayInputStream((byte[]) attachment); } else { stream = (InputStream) attachment; } Metadata metadata = new Metadata(); // If you specify the resource name (the filename, roughly) with this // parameter, then Tika can use it in guessing the right MIME type String resourceName = (String) record.getFirstValue(Fields.ATTACHMENT_NAME); if (resourceName != null) { metadata.add(Metadata.RESOURCE_NAME_KEY, resourceName); } // Provide stream's charset as hint to Tika for better auto detection String charset = (String) record.getFirstValue(Fields.ATTACHMENT_CHARSET); if (charset != null) { metadata.add(Metadata.CONTENT_ENCODING, charset); } if (includeMetaData) { for (Entry<String, Object> entry : record.getFields().entries()) { metadata.add(entry.getKey(), entry.getValue().toString()); } } String mimeType = getMediaType(stream, metadata, excludeParameters); record.replaceValues(Fields.ATTACHMENT_MIME_TYPE, mimeType); } } return super.doProcess(record); }
@Override protected boolean doProcess(Record record, InputStream inputStream) { Parser parser = detectParser(record); if (parser == null) { return false; } ParseContext parseContext = new ParseContext(); parseContext.set(Locale.class, locale); Metadata metadata = new Metadata(); for (Entry<String, Object> entry : record.getFields().entries()) { metadata.add(entry.getKey(), entry.getValue().toString()); } SolrContentHandler handler = solrContentHandlerFactory.createSolrContentHandler(metadata, solrParams, schema); try { inputStream = TikaInputStream.get(inputStream); ContentHandler parsingHandler = handler; // String xpathExpr = "/xhtml:html/xhtml:body/xhtml:div/descendant:node()"; if (xpathExpr != null) { Matcher matcher = PARSER.parse(xpathExpr); parsingHandler = new MatchingContentHandler(parsingHandler, matcher); } try { parser.parse(inputStream, parsingHandler, metadata, parseContext); } catch (IOException e) { throw new MorphlineRuntimeException("Cannot parse", e); } catch (SAXException e) { throw new MorphlineRuntimeException("Cannot parse", e); } catch (TikaException e) { throw new MorphlineRuntimeException("Cannot parse", e); } } finally { if (inputStream != null) { Closeables.closeQuietly(inputStream); } } SolrInputDocument doc = handler.newDocument(); LOG.debug("solr doc: {}", doc); Record outputRecord = toRecord(doc); return getChild().process(outputRecord); }