/**
  * Unpack the markup in the document. This converts markup from the native format (e.g. XML, RTF)
  * into annotations in GATE format. Uses the markupElementsMap to determine which elements to
  * convert, and what annotation type names to use.
  */
 public void unpackMarkup(Document doc) throws DocumentFormatException {
   if (doc == null || doc.getContent() == null) return;
   setNewLineProperty(doc);
   // Create paragraph annotations in the specified annotation set
   int endOffset = doc.getContent().toString().length();
   int startOffset = 0;
   annotateParagraphs(doc, startOffset, endOffset, GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
 } // unpackMarkup
Beispiel #2
0
  /**
   * Constructs a XmlDocumentHandler object.
   *
   * @param aDocument the Gate document that will be processed.
   * @param aMarkupElementsMap this map contains the elements name that we want to create.
   * @param anElement2StringMap this map contains the strings that will be added to the text
   *     contained by the key element.
   * @param anAnnotationSet is the annotation set that will be filled when the document was
   *     processed
   */
  public XmlDocumentHandler(
      gate.Document aDocument,
      Map<String, String> aMarkupElementsMap,
      Map<String, String> anElement2StringMap,
      AnnotationSet anAnnotationSet) {
    // init parent
    super();
    // init stack
    stack = new Stack<CustomObject>();

    // this string contains the plain text (the text without markup)
    tmpDocContent = new StringBuffer(aDocument.getContent().size().intValue());

    // colector is used later to transform all custom objects into annotation
    // objects
    colector = new LinkedList<CustomObject>();

    // the Gate document
    doc = aDocument;

    // this map contains the elements name that we want to create
    // if it's null all the elements from the XML documents will be transformed
    // into Gate annotation objects
    markupElementsMap = aMarkupElementsMap;

    // this map contains the string that we want to insert iside the document
    // content, when a certain element is found
    // if the map is null then no string is added
    element2StringMap = anElement2StringMap;

    basicAS = anAnnotationSet;
    customObjectsId = 0;
  } // XmlDocumentHandler()/
Beispiel #3
0
  /**
   * Constructor initialises all the private memeber data
   *
   * @param aDocument The gate document that will be processed
   * @param aMarkupElementsMap The map containing the elements that will transform into annotations
   * @param anAnnotationSet The annotation set that will contain annotations resulted from the
   *     processing of the gate document
   */
  public HtmlDocumentHandler(
      gate.Document aDocument,
      Map<String, String> aMarkupElementsMap,
      gate.AnnotationSet anAnnotationSet) {
    // init stack
    stack = new Stack<CustomObject>();

    // this string contains the plain text (the text without markup)
    tmpDocContent = new StringBuffer(aDocument.getContent().size().intValue());

    // colector is used later to transform all custom objects into
    // annotation objects
    colector = new LinkedList<CustomObject>();

    // the Gate document
    doc = aDocument;

    // this map contains the elements name that we want to create
    // if it's null all the elements from the XML documents will be transformed
    // into Gate annotation objects
    markupElementsMap = aMarkupElementsMap;

    // init an annotation set for this gate document
    basicAS = anAnnotationSet;

    customObjectsId = 0;
  } // HtmlDocumentHandler
  /**
   * Check the new line sequence and set document property. <br>
   * Possible values are CRLF, LFCR, CR, LF
   */
  protected void setNewLineProperty(Document doc) {
    String content = doc.getContent().toString();
    String newLineType = "";

    char ch = ' ';
    char lastch = ' ';
    for (int i = 0; i < content.length(); ++i) {
      ch = content.charAt(i);
      if (lastch == '\r') {
        if (ch == '\n') {
          newLineType = "CRLF";
          break;
        } else {
          newLineType = "CR";
          break;
        }
      }
      if (lastch == '\n') {
        if (ch == '\r') {
          newLineType = "LFCR";
          break;
        } else {
          newLineType = "LF";
          break;
        }
      }
      lastch = ch;
    } // for

    doc.getFeatures().put(GateConstants.DOCUMENT_NEW_LINE_TYPE, newLineType);
  } // setNewLineProperty()
  /**
   * Generation of a GATE document from a Behemoth one
   *
   * @param key URL of the input doc
   * @param inputDoc
   * @return
   * @throws ResourceInstantiationException
   * @throws InvalidOffsetException
   * @throws IOException
   */
  public gate.Document generateGATEDoc(BehemothDocument inputDoc)
      throws ResourceInstantiationException, InvalidOffsetException, IOException {

    gate.Document gatedocument = null;

    // if no text is available (e.g. Tika has not extracted it)
    // let GATE do the parsing itself from the binary content
    if (inputDoc.getText() == null) {
      try {
        gatedocument = generateGATEDocFromLocalDump(inputDoc);

        // transfer the text from GATE to Behemoth
        String textContent = gatedocument.getContent().toString();
        inputDoc.setText(textContent);

        return gatedocument;
      } catch (Exception e) {
        LOG.error("Can't generate GATE doc from byte dump", e);
      }
    }

    // if the input document does not have any text -> create a doc with an
    // empty text

    String text = inputDoc.getText();
    if (inputDoc.getText() == null) text = "";
    else text = inputDoc.getText();

    gatedocument = Factory.newDocument(text);

    // then the metadata as document features
    FeatureMap docFeatures = gatedocument.getFeatures();
    String docUrl = inputDoc.getUrl();
    if (docUrl != null) docFeatures.put("gate.SourceURL", docUrl);
    if (inputDoc.getMetadata() != null) {
      Iterator<Entry<Writable, Writable>> iter = inputDoc.getMetadata().entrySet().iterator();
      while (iter.hasNext()) {
        Entry<Writable, Writable> entry = iter.next();
        String skey = entry.getKey().toString().trim();
        String svalue = null;
        if (entry.getValue() != null) svalue = entry.getValue().toString().trim();
        docFeatures.put(skey, svalue);
      }
    }

    // finally the annotations as original markups
    // TODO change the name of the annotation set via config
    AnnotationSet outputAS = gatedocument.getAnnotations("Original markups");
    for (Annotation annot : inputDoc.getAnnotations()) {
      // add to outputAS as a GATE annotation
      FeatureMap features = Factory.newFeatureMap();
      features.putAll(annot.getFeatures());
      outputAS.add(annot.getStart(), annot.getEnd(), annot.getType(), features);
    }
    return gatedocument;
  }
  /**
   * This is a test to see if the GATE document has a valid URL or a valid content.
   *
   * @param doc
   * @throws DocumentFormatException
   */
  protected static boolean hasContentButNoValidUrl(Document doc) throws DocumentFormatException {
    try {
      if (doc.getSourceUrl() == null && doc.getContent() != null) {
        // The doc's url is null but there is a content.
        return true;
      } else {
        doc.getSourceUrl().openConnection();
      }
    } catch (IOException ex1) {
      // The URL is not null but is not valid.
      if (doc.getContent() == null)
        // The document content is also null. There is nothing we can do.
        throw new DocumentFormatException(
            "The document doesn't have a" + " valid URL and also no content");
      return true;
    } // End try

    return false;
  }
  /** Delete '\r' in combination CRLF or LFCR in document content */
  private void removeExtraNewLine(Document doc) {
    String content = doc.getContent().toString();
    StringBuffer buff = new StringBuffer(content);

    char ch = ' ';
    char lastch = ' ';
    for (int i = content.length() - 1; i > -1; --i) {
      ch = content.charAt(i);
      if (ch == '\n' && lastch == '\r') {
        buff.deleteCharAt(i + 1);
      }
      if (ch == '\r' && lastch == '\n') {
        buff.deleteCharAt(i);
        ch = lastch;
      }
      lastch = ch;
    } // for

    doc.setContent(new DocumentContentImpl(buff.toString()));
  } // removeExtraNewLine(Document doc)
  /**
   * Checks two documents for equality.
   *
   * @param doc1 a document
   * @param doc2 another document
   * @return a boolean.
   */
  public static boolean documentsEqual(Document doc1, Document doc2) {
    message = "";
    if (doc1 == null ^ doc2 == null) {
      message = "Documents not equal: null<>non-null!";
      return false;
    }
    if (doc1 == null) return true;
    if (!check(doc1.getContent(), doc2.getContent())) {
      message = "Document contents different!";
      return false;
    }

    if (!check(doc1.getAnnotations(), doc2.getAnnotations())) {
      message = "Documents default AS not equal!";
      return false;
    }

    if (doc1 instanceof TextualDocument) {
      if (doc2 instanceof TextualDocument) {
        if (!check(
            ((TextualDocument) doc1).getEncoding(), ((TextualDocument) doc2).getEncoding())) {
          message = "Textual documents with different encodings!";
          return false;
        }
      } else {
        message = "Documents not equal: textual<>non-textual!";
        return false;
      }
    }
    if (!check(doc1.getFeatures(), doc2.getFeatures())) {
      message = "Documents features not equal!";
      return false;
    }

    // needs friend declaration :(
    //    if(!markupAware.equals(doc.markupAware)) return false;

    if (!check(doc1.getNamedAnnotationSets(), doc2.getNamedAnnotationSets())) {
      message = "Documents named annots not equal!";
      return false;
    }

    //    if(doc1 instanceof DocumentImpl){
    //      if(doc2 instanceof DocumentImpl){
    //        if(! check(((DocumentImpl)doc1).getNextNodeId(),
    //                   ((DocumentImpl)doc2).getNextNodeId())){
    //          message = "Documents next nodeID not equal!";
    //          return false;
    //        }
    //        if(! check(((DocumentImpl)doc1).getNextAnnotationId(),
    //                   ((DocumentImpl)doc2).getNextAnnotationId())){
    //          message = "Documents next annotationIDs not equal!";
    //          return false;
    //        }
    //      }else{
    //        message = "Documents not equal: DocumentImpl<>non-DocumentImpl!";
    //        return false;
    //      }
    //    }

    if (!check(doc1.getSourceUrl(), doc2.getSourceUrl())) {
      message = "Documents sourceURLs not equal!";
      return false;
    }
    if (!(check(doc1.getSourceUrlStartOffset(), doc2.getSourceUrlStartOffset())
        && check(doc1.getSourceUrlEndOffset(), doc2.getSourceUrlEndOffset()))) {
      message = "Documents sourceURLOffsets not equal!";
      return false;
    }
    return true;
  }
  // carry out the actual annotations on the given span of text in the
  // document.
  protected void annotateText(Document doc, AnnotationSet outputAS, long from, long to) {
    String text = "";
    try {
      text = doc.getContent().getContent(from, to).toString();
    } catch (InvalidOffsetException ex) {
      throw new GateRuntimeException("Unexpected offset exception, offsets are " + from + "/" + to);
    }
    // send the text to the service and get back the response
    // System.out.println("Annotating text: "+text);
    // System.out.println("Starting offset is "+from);

    // NOTE: there is a bug in the TagMe service which causes offset errors
    // if we use the tweet mode and there are certain patterns in the tweet.
    // The approach recommended by Francesco Piccinno is to replace those
    // patterns by spaces.
    if (getIsTweet()) {
      logger.debug("Text before cleaning: >>" + text + "<<");
      // replace
      text = text.replaceAll(patternStringRT3, "    ");
      text = text.replaceAll(patternStringRT2, "   ");
      text = text.replaceAll(patternHashTag, " $1");
      // now replace the remaining patterns by spaces
      StringBuilder sb = new StringBuilder(text);
      Matcher m = patternUrl.matcher(text);
      while (m.find()) {
        int start = m.start();
        int end = m.end();
        sb.replace(start, end, nSpaces(end - start));
      }
      m = patternUser.matcher(text);
      while (m.find()) {
        int start = m.start();
        int end = m.end();
        sb.replace(start, end, nSpaces(end - start));
      }
      text = sb.toString();
      logger.debug("Text after cleaning:  >>" + text + "<<");
    }
    TagMeAnnotation[] tagmeAnnotations = getTagMeAnnotations(text);
    for (TagMeAnnotation tagmeAnn : tagmeAnnotations) {
      if (tagmeAnn.rho >= minrho) {
        FeatureMap fm = Factory.newFeatureMap();
        fm.put("tagMeId", tagmeAnn.id);
        fm.put("title", tagmeAnn.title);
        fm.put("rho", tagmeAnn.rho);
        fm.put("spot", tagmeAnn.spot);
        fm.put("link_probability", tagmeAnn.link_probability);
        if (tagmeAnn.title == null) {
          throw new GateRuntimeException("Odd: got a null title from the TagMe service" + tagmeAnn);
        } else {
          fm.put("inst", "http://dbpedia.org/resource/" + recodeForDbp38(tagmeAnn.title));
        }
        try {
          gate.Utils.addAnn(
              outputAS, from + tagmeAnn.start, from + tagmeAnn.end, getOutputAnnotationType(), fm);
        } catch (Exception ex) {
          System.err.println(
              "Got an exception in document " + doc.getName() + ": " + ex.getLocalizedMessage());
          ex.printStackTrace(System.err);
          System.err.println("from=" + from + ", to=" + to + " TagMeAnn=" + tagmeAnn);
        }
      }
    }
  }
 /**
  * This method annotates paragraphs in a GATE document. The investigated text spans beetween start
  * and end offsets and the paragraph annotations are created in the annotSetName. If annotSetName
  * is null then they are creted in the default annotation set.
  *
  * @param aDoc is the gate document on which the paragraph detection would be performed.If it is
  *     null or its content it's null then the method woul simply return doing nothing.
  * @param startOffset is the index form the document content from which the paragraph detection
  *     will start
  * @param endOffset is the offset where the detection will end.
  * @param annotSetName is the name of the set in which paragraph annotation would be created.The
  *     annotation type created will be "paragraph"
  */
 public void annotateParagraphs(Document aDoc, int startOffset, int endOffset, String annotSetName)
     throws DocumentFormatException {
   // Simply return if the document is null or its content
   if (aDoc == null || aDoc.getContent() == null) return;
   // Simply return if the start is > than the end
   if (startOffset > endOffset) return;
   // Decide where to put the newly detected annotations
   AnnotationSet annotSet = null;
   if (annotSetName == null) annotSet = aDoc.getAnnotations();
   else annotSet = aDoc.getAnnotations(annotSetName);
   // Extract the document content
   String content = aDoc.getContent().toString();
   // This is the offset marking the start of a para
   int startOffsetPara = startOffset;
   // This marks the ned of a para
   int endOffsetPara = endOffset;
   // The initial sate of the FSA
   int state = 1;
   // This field marks that a BR entity was read
   // A BR entity can be NL or NL CR, depending on the operating system (UNIX
   // or DOS)
   boolean readBR = false;
   int index = startOffset;
   while (index < endOffset) {
     // Read the current char
     char ch = content.charAt(index);
     // Test if a BR entity was read
     if (ch == '\n') {
       readBR = true;
       // If \n is followed by a \r then advance the index in order to read a
       // BR entity
       while ((index + 1 < endOffset) && (content.charAt(index + 1) == '\r')) index++;
     } // End if
     switch (state) {
         // It is the initial and also a final state
         // Stay in state 1 while it reads whitespaces
       case 1:
         {
           // If reads a non whitespace char then move to state 2 and record
           // the beggining of a paragraph
           if (!Character.isWhitespace(ch)) {
             state = 2;
             startOffsetPara = index;
           } // End if
         }
         break;
         // It can be also a final state.
       case 2:
         {
           // Stay in state 2 while reading chars != BR entities
           if (readBR) {
             // If you find a BR char go to state 3. The possible end of the para
             // can be index. This will be confirmed by state 3. So, this is why
             // the end of a para is recorded here.
             readBR = false;
             endOffsetPara = index;
             state = 3;
           } // End if
         }
         break;
         // It can be also a final state
         // From state 3 there are only 2 possible ways: (state 2 or state1)
         // In state 1 it needs to read a BR
         // For state 2 it nead to read something different then a BR
       case 3:
         {
           if (readBR) {
             // A BR was read. Go to state 1
             readBR = false;
             state = 1;
             // Create an annotation type paragraph
             try {
               annotSet.add(
                   new Long(startOffsetPara),
                   new Long(endOffsetPara),
                   "paragraph",
                   Factory.newFeatureMap());
             } catch (gate.util.InvalidOffsetException ioe) {
               throw new DocumentFormatException(
                   "Coudn't create a paragraph" + " annotation", ioe);
             } // End try
           } else {
             // Go to state 2 an keep reading chars
             state = 2;
           } // End if
         }
         break;
     } // End switch
     // Prepare to read the next char.
     index++;
   } // End while
   endOffsetPara = index;
   // Investigate where the finite automata has stoped
   if (state == 2 || state == 3) {
     // Create an annotation type paragraph
     try {
       annotSet.add(
           new Long(startOffsetPara),
           // Create the final annotation using the endOffset
           new Long(endOffsetPara),
           "paragraph",
           Factory.newFeatureMap());
     } catch (gate.util.InvalidOffsetException ioe) {
       throw new DocumentFormatException("Coudn't create a paragraph" + " annotation", ioe);
     } // End try
   } // End if
 } // End annotateParagraphs();