/** * Unpack the markup in the document. This converts markup from the native format (e.g. XML, RTF) * into annotations in GATE format. Uses the markupElementsMap to determine which elements to * convert, and what annotation type names to use. */ public void unpackMarkup(Document doc) throws DocumentFormatException { if (doc == null || doc.getContent() == null) return; setNewLineProperty(doc); // Create paragraph annotations in the specified annotation set int endOffset = doc.getContent().toString().length(); int startOffset = 0; annotateParagraphs(doc, startOffset, endOffset, GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME); } // unpackMarkup
/**
 * Creates a handler that collects the plain text and the markup of an XML document.
 *
 * @param aDocument the GATE document that will be processed; must not be {@code null}, because
 *     its content size is used to pre-size the text buffer
 * @param aMarkupElementsMap names of the elements that should be turned into annotations; per
 *     the original notes, {@code null} means every element is converted
 * @param anElement2StringMap strings to insert into the text for given elements; per the
 *     original notes, {@code null} means nothing is inserted
 * @param anAnnotationSet the annotation set that is filled while the document is processed
 */
public XmlDocumentHandler(
    gate.Document aDocument,
    Map<String, String> aMarkupElementsMap,
    Map<String, String> anElement2StringMap,
    AnnotationSet anAnnotationSet) {
  super();

  // Caller-supplied collaborators.
  this.doc = aDocument;
  this.markupElementsMap = aMarkupElementsMap;
  this.element2StringMap = anElement2StringMap;
  this.basicAS = anAnnotationSet;

  // Working state used during parsing.
  this.stack = new Stack<CustomObject>();
  // Holds the text without markup; sized up-front from the document content.
  this.tmpDocContent = new StringBuffer(aDocument.getContent().size().intValue());
  // Custom objects gathered here are converted into annotations later on.
  this.colector = new LinkedList<CustomObject>();
  this.customObjectsId = 0;
} // XmlDocumentHandler()
/**
 * Creates a handler and initialises all of its private working state.
 *
 * @param aDocument the GATE document that will be processed; must not be {@code null}, because
 *     its content size is used to pre-size the text buffer
 * @param aMarkupElementsMap names of the elements that should be turned into annotations; per
 *     the original notes, {@code null} means every element is converted
 * @param anAnnotationSet the annotation set that receives the annotations produced while
 *     processing the document
 */
public HtmlDocumentHandler(
    gate.Document aDocument,
    Map<String, String> aMarkupElementsMap,
    gate.AnnotationSet anAnnotationSet) {
  // Caller-supplied collaborators.
  this.doc = aDocument;
  this.markupElementsMap = aMarkupElementsMap;
  this.basicAS = anAnnotationSet;

  // Working state used during parsing.
  this.stack = new Stack<CustomObject>();
  // Holds the text without markup; sized up-front from the document content.
  this.tmpDocContent = new StringBuffer(aDocument.getContent().size().intValue());
  // Custom objects gathered here are converted into annotations later on.
  this.colector = new LinkedList<CustomObject>();
  this.customObjectsId = 0;
} // HtmlDocumentHandler
/** * Check the new line sequence and set document property. <br> * Possible values are CRLF, LFCR, CR, LF */ protected void setNewLineProperty(Document doc) { String content = doc.getContent().toString(); String newLineType = ""; char ch = ' '; char lastch = ' '; for (int i = 0; i < content.length(); ++i) { ch = content.charAt(i); if (lastch == '\r') { if (ch == '\n') { newLineType = "CRLF"; break; } else { newLineType = "CR"; break; } } if (lastch == '\n') { if (ch == '\r') { newLineType = "LFCR"; break; } else { newLineType = "LF"; break; } } lastch = ch; } // for doc.getFeatures().put(GateConstants.DOCUMENT_NEW_LINE_TYPE, newLineType); } // setNewLineProperty()
/** * Generation of a GATE document from a Behemoth one * * @param key URL of the input doc * @param inputDoc * @return * @throws ResourceInstantiationException * @throws InvalidOffsetException * @throws IOException */ public gate.Document generateGATEDoc(BehemothDocument inputDoc) throws ResourceInstantiationException, InvalidOffsetException, IOException { gate.Document gatedocument = null; // if no text is available (e.g. Tika has not extracted it) // let GATE do the parsing itself from the binary content if (inputDoc.getText() == null) { try { gatedocument = generateGATEDocFromLocalDump(inputDoc); // transfer the text from GATE to Behemoth String textContent = gatedocument.getContent().toString(); inputDoc.setText(textContent); return gatedocument; } catch (Exception e) { LOG.error("Can't generate GATE doc from byte dump", e); } } // if the input document does not have any text -> create a doc with an // empty text String text = inputDoc.getText(); if (inputDoc.getText() == null) text = ""; else text = inputDoc.getText(); gatedocument = Factory.newDocument(text); // then the metadata as document features FeatureMap docFeatures = gatedocument.getFeatures(); String docUrl = inputDoc.getUrl(); if (docUrl != null) docFeatures.put("gate.SourceURL", docUrl); if (inputDoc.getMetadata() != null) { Iterator<Entry<Writable, Writable>> iter = inputDoc.getMetadata().entrySet().iterator(); while (iter.hasNext()) { Entry<Writable, Writable> entry = iter.next(); String skey = entry.getKey().toString().trim(); String svalue = null; if (entry.getValue() != null) svalue = entry.getValue().toString().trim(); docFeatures.put(skey, svalue); } } // finally the annotations as original markups // TODO change the name of the annotation set via config AnnotationSet outputAS = gatedocument.getAnnotations("Original markups"); for (Annotation annot : inputDoc.getAnnotations()) { // add to outputAS as a GATE annotation FeatureMap features = Factory.newFeatureMap(); 
features.putAll(annot.getFeatures()); outputAS.add(annot.getStart(), annot.getEnd(), annot.getType(), features); } return gatedocument; }
/** * This is a test to see if the GATE document has a valid URL or a valid content. * * @param doc * @throws DocumentFormatException */ protected static boolean hasContentButNoValidUrl(Document doc) throws DocumentFormatException { try { if (doc.getSourceUrl() == null && doc.getContent() != null) { // The doc's url is null but there is a content. return true; } else { doc.getSourceUrl().openConnection(); } } catch (IOException ex1) { // The URL is not null but is not valid. if (doc.getContent() == null) // The document content is also null. There is nothing we can do. throw new DocumentFormatException( "The document doesn't have a" + " valid URL and also no content"); return true; } // End try return false; }
/** Delete '\r' in combination CRLF or LFCR in document content */ private void removeExtraNewLine(Document doc) { String content = doc.getContent().toString(); StringBuffer buff = new StringBuffer(content); char ch = ' '; char lastch = ' '; for (int i = content.length() - 1; i > -1; --i) { ch = content.charAt(i); if (ch == '\n' && lastch == '\r') { buff.deleteCharAt(i + 1); } if (ch == '\r' && lastch == '\n') { buff.deleteCharAt(i); ch = lastch; } lastch = ch; } // for doc.setContent(new DocumentContentImpl(buff.toString())); } // removeExtraNewLine(Document doc)
/** * Checks two documents for equality. * * @param doc1 a document * @param doc2 another document * @return a boolean. */ public static boolean documentsEqual(Document doc1, Document doc2) { message = ""; if (doc1 == null ^ doc2 == null) { message = "Documents not equal: null<>non-null!"; return false; } if (doc1 == null) return true; if (!check(doc1.getContent(), doc2.getContent())) { message = "Document contents different!"; return false; } if (!check(doc1.getAnnotations(), doc2.getAnnotations())) { message = "Documents default AS not equal!"; return false; } if (doc1 instanceof TextualDocument) { if (doc2 instanceof TextualDocument) { if (!check( ((TextualDocument) doc1).getEncoding(), ((TextualDocument) doc2).getEncoding())) { message = "Textual documents with different encodings!"; return false; } } else { message = "Documents not equal: textual<>non-textual!"; return false; } } if (!check(doc1.getFeatures(), doc2.getFeatures())) { message = "Documents features not equal!"; return false; } // needs friend declaration :( // if(!markupAware.equals(doc.markupAware)) return false; if (!check(doc1.getNamedAnnotationSets(), doc2.getNamedAnnotationSets())) { message = "Documents named annots not equal!"; return false; } // if(doc1 instanceof DocumentImpl){ // if(doc2 instanceof DocumentImpl){ // if(! check(((DocumentImpl)doc1).getNextNodeId(), // ((DocumentImpl)doc2).getNextNodeId())){ // message = "Documents next nodeID not equal!"; // return false; // } // if(! 
check(((DocumentImpl)doc1).getNextAnnotationId(), // ((DocumentImpl)doc2).getNextAnnotationId())){ // message = "Documents next annotationIDs not equal!"; // return false; // } // }else{ // message = "Documents not equal: DocumentImpl<>non-DocumentImpl!"; // return false; // } // } if (!check(doc1.getSourceUrl(), doc2.getSourceUrl())) { message = "Documents sourceURLs not equal!"; return false; } if (!(check(doc1.getSourceUrlStartOffset(), doc2.getSourceUrlStartOffset()) && check(doc1.getSourceUrlEndOffset(), doc2.getSourceUrlEndOffset()))) { message = "Documents sourceURLOffsets not equal!"; return false; } return true; }
// carry out the actual annotations on the given span of text in the // document. protected void annotateText(Document doc, AnnotationSet outputAS, long from, long to) { String text = ""; try { text = doc.getContent().getContent(from, to).toString(); } catch (InvalidOffsetException ex) { throw new GateRuntimeException("Unexpected offset exception, offsets are " + from + "/" + to); } // send the text to the service and get back the response // System.out.println("Annotating text: "+text); // System.out.println("Starting offset is "+from); // NOTE: there is a bug in the TagMe service which causes offset errors // if we use the tweet mode and there are certain patterns in the tweet. // The approach recommended by Francesco Piccinno is to replace those // patterns by spaces. if (getIsTweet()) { logger.debug("Text before cleaning: >>" + text + "<<"); // replace text = text.replaceAll(patternStringRT3, " "); text = text.replaceAll(patternStringRT2, " "); text = text.replaceAll(patternHashTag, " $1"); // now replace the remaining patterns by spaces StringBuilder sb = new StringBuilder(text); Matcher m = patternUrl.matcher(text); while (m.find()) { int start = m.start(); int end = m.end(); sb.replace(start, end, nSpaces(end - start)); } m = patternUser.matcher(text); while (m.find()) { int start = m.start(); int end = m.end(); sb.replace(start, end, nSpaces(end - start)); } text = sb.toString(); logger.debug("Text after cleaning: >>" + text + "<<"); } TagMeAnnotation[] tagmeAnnotations = getTagMeAnnotations(text); for (TagMeAnnotation tagmeAnn : tagmeAnnotations) { if (tagmeAnn.rho >= minrho) { FeatureMap fm = Factory.newFeatureMap(); fm.put("tagMeId", tagmeAnn.id); fm.put("title", tagmeAnn.title); fm.put("rho", tagmeAnn.rho); fm.put("spot", tagmeAnn.spot); fm.put("link_probability", tagmeAnn.link_probability); if (tagmeAnn.title == null) { throw new GateRuntimeException("Odd: got a null title from the TagMe service" + tagmeAnn); } else { fm.put("inst", 
"http://dbpedia.org/resource/" + recodeForDbp38(tagmeAnn.title)); } try { gate.Utils.addAnn( outputAS, from + tagmeAnn.start, from + tagmeAnn.end, getOutputAnnotationType(), fm); } catch (Exception ex) { System.err.println( "Got an exception in document " + doc.getName() + ": " + ex.getLocalizedMessage()); ex.printStackTrace(System.err); System.err.println("from=" + from + ", to=" + to + " TagMeAnn=" + tagmeAnn); } } } }
/** * This method annotates paragraphs in a GATE document. The investigated text spans beetween start * and end offsets and the paragraph annotations are created in the annotSetName. If annotSetName * is null then they are creted in the default annotation set. * * @param aDoc is the gate document on which the paragraph detection would be performed.If it is * null or its content it's null then the method woul simply return doing nothing. * @param startOffset is the index form the document content from which the paragraph detection * will start * @param endOffset is the offset where the detection will end. * @param annotSetName is the name of the set in which paragraph annotation would be created.The * annotation type created will be "paragraph" */ public void annotateParagraphs(Document aDoc, int startOffset, int endOffset, String annotSetName) throws DocumentFormatException { // Simply return if the document is null or its content if (aDoc == null || aDoc.getContent() == null) return; // Simply return if the start is > than the end if (startOffset > endOffset) return; // Decide where to put the newly detected annotations AnnotationSet annotSet = null; if (annotSetName == null) annotSet = aDoc.getAnnotations(); else annotSet = aDoc.getAnnotations(annotSetName); // Extract the document content String content = aDoc.getContent().toString(); // This is the offset marking the start of a para int startOffsetPara = startOffset; // This marks the ned of a para int endOffsetPara = endOffset; // The initial sate of the FSA int state = 1; // This field marks that a BR entity was read // A BR entity can be NL or NL CR, depending on the operating system (UNIX // or DOS) boolean readBR = false; int index = startOffset; while (index < endOffset) { // Read the current char char ch = content.charAt(index); // Test if a BR entity was read if (ch == '\n') { readBR = true; // If \n is followed by a \r then advance the index in order to read a // BR entity while ((index + 1 < 
endOffset) && (content.charAt(index + 1) == '\r')) index++; } // End if switch (state) { // It is the initial and also a final state // Stay in state 1 while it reads whitespaces case 1: { // If reads a non whitespace char then move to state 2 and record // the beggining of a paragraph if (!Character.isWhitespace(ch)) { state = 2; startOffsetPara = index; } // End if } break; // It can be also a final state. case 2: { // Stay in state 2 while reading chars != BR entities if (readBR) { // If you find a BR char go to state 3. The possible end of the para // can be index. This will be confirmed by state 3. So, this is why // the end of a para is recorded here. readBR = false; endOffsetPara = index; state = 3; } // End if } break; // It can be also a final state // From state 3 there are only 2 possible ways: (state 2 or state1) // In state 1 it needs to read a BR // For state 2 it nead to read something different then a BR case 3: { if (readBR) { // A BR was read. Go to state 1 readBR = false; state = 1; // Create an annotation type paragraph try { annotSet.add( new Long(startOffsetPara), new Long(endOffsetPara), "paragraph", Factory.newFeatureMap()); } catch (gate.util.InvalidOffsetException ioe) { throw new DocumentFormatException( "Coudn't create a paragraph" + " annotation", ioe); } // End try } else { // Go to state 2 an keep reading chars state = 2; } // End if } break; } // End switch // Prepare to read the next char. index++; } // End while endOffsetPara = index; // Investigate where the finite automata has stoped if (state == 2 || state == 3) { // Create an annotation type paragraph try { annotSet.add( new Long(startOffsetPara), // Create the final annotation using the endOffset new Long(endOffsetPara), "paragraph", Factory.newFeatureMap()); } catch (gate.util.InvalidOffsetException ioe) { throw new DocumentFormatException("Coudn't create a paragraph" + " annotation", ioe); } // End try } // End if } // End annotateParagraphs();