Beispiel #1
0
 /**
  * Appends a run of characters to one of the two text buffers: {@code accumulatorRef} while the
  * {@code ref} flag is set, {@code accumulator} otherwise.
  *
  * @param buffer array holding the characters
  * @param start index of the first character to copy
  * @param length number of characters to copy
  */
 public void characters(char[] buffer, int start, int length) {
   if (!ref) {
     accumulator.append(buffer, start, length);
     return;
   }
   accumulatorRef.append(buffer, start, length);
 }
  /**
   * Handles a closing tag. The buffered text is written out at the end of {@code p} and
   * {@code description} elements; the end of {@code description} also switches counting off. While
   * counting is off the remaining text and the closing tag itself are echoed to the writer; while it
   * is on, only separators are emitted for {@code row} and {@code p}.
   *
   * @param uri namespace URI of the element (not used)
   * @param localName local name of the element (not used)
   * @param qName qualified name of the element being closed
   * @throws SAXException declared by the handler contract
   * @throws GrobidException wrapping any exception raised while writing
   */
  public void endElement(java.lang.String uri, java.lang.String localName, java.lang.String qName)
      throws SAXException {

    try {
      final boolean paragraph = qName.equals("p");
      final boolean description = qName.equals("description");

      if (paragraph || description) {
        writer.write(getText());
        accumulator.setLength(0);
      }

      if (description) {
        counting = false;
      }

      if (counting) {
        if (qName.equals("row")) {
          accumulator.append(" ");
        }
        if (paragraph) {
          writer.write("\n");
          accumulator.append(" ");
        }
      } else {
        writer.write(getText());
        accumulator.setLength(0);
        writer.write("</" + qName + ">\n");
      }
    } catch (Exception e) {
      throw new GrobidException("An exception occured while running Grobid.", e);
    }
  }
  /**
   * Reads the entry on top of the context stack and then empties the text buffer.
   *
   * @param fireOnlyIfMixed when {@code true}, the method returns immediately if the buffer is empty
   * @throws SAXException declared; nothing in the visible body throws it directly
   */
  private void dispatch(final boolean fireOnlyIfMixed) throws SAXException {
    if (fireOnlyIfMixed) {
      if (buffer.length() == 0) {
        return; // nothing buffered, skip
      }
    }

    // NOTE(review): both values are read but never used in the visible code; the casts are kept
    // because they still fail fast on an unexpected stack entry.
    final Object[] top = (Object[]) context.peek();
    final String current = (String) top[0];
    final Attributes currentAttrs = (Attributes) top[1];
    buffer.delete(0, buffer.length());
  }
Beispiel #4
0
  /**
   * Records a notation declaration as a {@code <!NOTATION ...>} line in the internal subset buffer.
   * Nothing is recorded unless the handler is currently inside the internal subset.
   *
   * @param name name of the notation
   * @param publicID public ID of the notation
   * @param systemID system ID of the notation
   * @throws SAXException declared by the handler contract
   */
  public void notationDecl(String name, String publicID, String systemID) throws SAXException {
    if (inInternalSubset) {
      internalSubset.append("  <!NOTATION ");
      internalSubset.append(name);
      appendExternalId(publicID, systemID);
      internalSubset.append(">\n");
    }
  }
Beispiel #5
0
  /**
   * Records an unparsed entity declaration as an {@code <!ENTITY ... NDATA ...>} line in the
   * internal subset buffer. Nothing is recorded unless the handler is currently inside the internal
   * subset.
   *
   * @param name name of the unparsed entity
   * @param publicID public ID of the entity
   * @param systemID system ID of the entity
   * @param notationName name of the notation associated with the entity
   * @throws SAXException declared by the handler contract
   */
  public void unparsedEntityDecl(String name, String publicID, String systemID, String notationName)
      throws SAXException {
    if (inInternalSubset) {
      internalSubset.append("  <!ENTITY ");
      internalSubset.append(name);
      appendExternalId(publicID, systemID);
      internalSubset.append(" NDATA ");
      internalSubset.append(notationName);
      internalSubset.append(">\n");
    }
  }
Beispiel #6
0
  /**
   * Handles an external entity declaration. The public/system ID pair is always remembered under
   * the entity name; the {@code <!ENTITY ...>} line is added to the internal subset buffer only
   * while the handler is inside the internal subset.
   *
   * @param name entity name
   * @param publicID public ID
   * @param systemID system ID
   * @throws SAXException declared by the handler contract
   */
  public void externalEntityDecl(String name, String publicID, String systemID)
      throws SAXException {
    // Remember the IDs regardless of where the declaration came from
    final String[] ids = new String[] {publicID, systemID};
    externalEntities.put(name, ids);

    if (inInternalSubset) {
      internalSubset.append("  <!ENTITY ");
      internalSubset.append(name);
      appendExternalId(publicID, systemID);
      internalSubset.append(">\n");
    }
  }
Beispiel #7
0
 /**
  * Writes an external ID to the internal subset buffer. A public ID is written as
  * {@code PUBLIC "..."}; a system ID is written in quotes, preceded by {@code SYSTEM} when there
  * is no public ID and by a single space otherwise. A {@code null} ID is skipped.
  *
  * @param publicID the public ID, or {@code null}
  * @param systemID the system ID, or {@code null}
  */
 private void appendExternalId(String publicID, String systemID) {
   final boolean hasPublicId = publicID != null;
   if (hasPublicId) {
     internalSubset.append(" PUBLIC \"").append(publicID).append('\"');
   }
   if (systemID == null) {
     return;
   }
   internalSubset.append(hasPublicId ? " " : " SYSTEM ");
   internalSubset.append('\"').append(systemID).append('\"');
 }
Beispiel #8
0
  /**
   * Records an internal entity declaration as an {@code <!ENTITY name "value">} line in the
   * internal subset buffer. A name starting with {@code %} is written as {@code "% "} followed by
   * the rest of the name. Declarations seen outside the internal subset are skipped.
   *
   * @param name name of the entity
   * @param value value of the entity
   * @throws SAXException declared by the handler contract
   */
  public void internalEntityDecl(String name, String value) throws SAXException {
    if (inInternalSubset) {
      internalSubset.append("  <!ENTITY ");
      final String declaredName = name.startsWith("%") ? "% " + name.substring(1) : name;
      internalSubset.append(declaredName);
      internalSubset.append(" \"").append(value).append("\">\n");
    }
  }
 /**
  * Returns the accumulated text with every Unicode space separator replaced by a plain SPACE and
  * with leading and trailing whitespace trimmed.
  *
  * @return the normalised text
  */
 public String getText() {
   final String buffered = accumulator.toString().trim();
   // e.g. NO-BREAK SPACE becomes an ordinary SPACE, which the final trim can then strip
   final String normalised = buffered.replaceAll("\\p{javaSpaceChar}", " ");
   return normalised.trim();
 }
Beispiel #10
0
  /**
   * Records an attribute declaration as an {@code <!ATTLIST ...>} line in the internal subset
   * buffer. When {@code valueDefault} is {@code null} the quoted {@code value} is written; otherwise
   * {@code valueDefault} is written, followed by the quoted {@code value} only when it equals
   * {@code #FIXED}. Declarations seen outside the internal subset are skipped.
   *
   * @param eName name of the element the attribute belongs to
   * @param aName name of the attribute
   * @param type attribute type
   * @param valueDefault default-declaration keyword, or {@code null}
   * @param value default value of the attribute
   * @throws SAXException declared by the handler contract
   */
  public void attributeDecl(
      String eName, String aName, String type, String valueDefault, String value)
      throws SAXException {
    if (!inInternalSubset) {
      return;
    }

    internalSubset.append("  <!ATTLIST ");
    internalSubset.append(eName).append(' ');
    internalSubset.append(aName).append(' ');
    internalSubset.append(type).append(' ');

    if (valueDefault == null) {
      internalSubset.append('\"').append(value).append('\"');
    } else {
      internalSubset.append(valueDefault);
      if (valueDefault.equals("#FIXED")) {
        internalSubset.append(" \"").append(value).append('\"');
      }
    }
    internalSubset.append(">\n");
  }
Beispiel #11
0
  /**
   * Handles a parsed comment. Pending character data is flushed first. Inside the DTD the comment
   * is copied to the internal subset buffer only when in the internal subset and {@code expand} is
   * off. Outside the DTD a non-empty comment is added to the document when at the root, and to the
   * current element otherwise. Nothing happens while {@code suppress} is set.
   *
   * @param ch array holding the comment characters
   * @param start index of the first comment character
   * @param length number of comment characters
   * @throws SAXException declared by the handler contract
   */
  public void comment(char[] ch, int start, int length) throws SAXException {
    if (suppress) {
      return;
    }

    flushCharacters();

    final String text = new String(ch, start, length);
    if (inDTD) {
      if (inInternalSubset && !expand) {
        internalSubset.append("  <!--").append(text).append("-->\n");
      }
      return;
    }
    if (text.equals("")) {
      return;
    }
    if (atRoot) {
      factory.addContent(document, factory.comment(text));
    } else {
      factory.addContent(getCurrentElement(), factory.comment(text));
    }
  }
  /**
   * Handles an opening tag. While counting is off, the pending text is written out and the tag is
   * echoed to the writer with every attribute that has both a name and a value. Afterwards
   * {@code description} resets the offset and switches counting on, and {@code patent-document}
   * switches counting off.
   *
   * @param namespaceURI namespace URI of the element (not used)
   * @param localName local name of the element (not used)
   * @param qName qualified name of the element being opened
   * @param atts attributes attached to the element
   * @throws SAXException declared by the handler contract
   * @throws GrobidException wrapping any exception raised while writing
   */
  public void startElement(String namespaceURI, String localName, String qName, Attributes atts)
      throws SAXException {
    try {
      // flush the text that precedes this tag
      if (!counting) {
        writer.write(getText());
        accumulator.setLength(0);
      }

      // echo the tag itself
      if (!counting) {
        writer.write("<" + qName);

        final int attributeCount = atts.getLength();
        for (int index = 0; index < attributeCount; index++) {
          final String attrName = atts.getQName(index);
          final String attrValue = atts.getValue(index);
          if (attrName == null || attrValue == null) {
            continue;
          }
          writer.write(" " + attrName + "=\"" + attrValue + "\"");
        }

        writer.write(">");
      }

      if (qName.equals("description")) {
        offset = 0;
        counting = true;
      } else if (qName.equals("patent-document")) {
        counting = false;
      }
    } catch (Exception e) {
      throw new GrobidException("An exception occured while running Grobid.", e);
    }
  }
Beispiel #13
0
  /**
   * Handles a closing tag while building labelled training text.
   *
   * <ul>
   *   <li>{@code ref} / {@code bibl}: the reference text is whitespace-normalised and registered
   *       as an NPL or patent reference (depending on the {@code npl} and {@code ref} flags); if a
   *       reference has been found its tokens are appended to {@code accumulatedText}, one per
   *       line, followed by a tab and a label. The first token gets the {@code I-} label variant.
   *   <li>{@code description}: if a reference has been found, the remaining text is tokenised and
   *       labelled {@code <other>}, or {@code <ignore>} once more than {@code N} tokens have been
   *       emitted (unless {@code N} is -1); the buffer and the {@code refFound} flag are then
   *       reset.
   *   <li>{@code patcit}: the current cited number is registered and the buffer cleared.
   *   <li>{@code patent-document} / {@code fulltext-document}: lexicon positions are computed from
   *       the whole content, which is then released.
   *   <li>{@code heading} / {@code row} append a space, {@code p} appends a newline, and a fixed
   *       list of metadata elements simply clears the buffer.
   * </ul>
   *
   * @param uri namespace URI of the element (not used)
   * @param localName local name of the element (not used)
   * @param qName qualified name of the element being closed
   * @throws SAXException declared by the handler contract
   * @throws GrobidException wrapping exceptions raised while appending labelled tokens
   */
  public void endElement(java.lang.String uri, java.lang.String localName, java.lang.String qName)
      throws SAXException {
    if (qName.equals("ref") || qName.equals("bibl")) {
      String refString = getRefText();
      refString = refString.replace("\n", " ");
      refString = refString.replace("\t", " ");
      refString = refString.replace("  ", " ");

      if (ref) {
        if (npl) {
          if (referencesNPL == null) {
            referencesNPL = new ArrayList<String>();
          }
          referencesNPL.add(refString);
          refFound = true;
          if (nplReferences) {
            nbNPLRef++;
          }
        } else {
          if (referencesPatent == null) {
            referencesPatent = new HashMap<String, ArrayList<String>>();
          }
          ArrayList<String> refss = referencesPatent.get(currentFileName);
          if (refss == null) {
            refss = new ArrayList<String>();
          }
          refss.add(refString);
          referencesPatent.put(currentFileName, refss);
          refFound = true;
          if (patentReferences) {
            nbPatentRef++;
          }
        }
      }

      if (refFound) {
        List<String> tokenizations = new ArrayList<String>();
        try {
          // TBD: pass a language object to the tokenize method call
          tokenizations = analyzer.tokenize(refString);
        } catch (Exception e) {
          LOGGER.debug("Tokenization for XML patent document has failed.");
        }

        // Labels for the first emitted token and for every later one. No end-of-reference label is
        // produced: the former "token == null" tests could never be true inside the loop.
        final String firstLabel;
        final String nextLabel;
        if (npl) {
          firstLabel = nplReferences ? "I-<refNPL>\n" : "<other>\n";
          nextLabel = nplReferences ? "<refNPL>\n" : "<other>\n";
        } else {
          firstLabel = patentReferences ? "I-<refPatent>\n" : "<other>\n";
          nextLabel = patentReferences ? "<refPatent>\n" : "<other>\n";
        }

        boolean first = true;
        for (String token : tokenizations) {
          // skip whitespace-only tokens (covers " ", "\t", "\n" and "\r")
          if (token.trim().length() == 0) {
            continue;
          }
          try {
            accumulatedText.append(token + "\t");
            allContent.append(token + " ");
            accumulatedText.append(first ? firstLabel : nextLabel);
          } catch (Exception e) {
            throw new GrobidException("An exception occured while running Grobid.", e);
          }
          first = false;
        }
      }
      ref = false;
    } else if (qName.equals("description")) {
      if (refFound) {
        String content = getText();

        List<String> tokenizations = new ArrayList<String>();
        try {
          // TBD: pass a language object to the tokenize method call
          tokenizations = analyzer.tokenize(content);
        } catch (Exception e) {
          LOGGER.debug("Tokenization for XML patent document has failed.");
        }

        int i = 0;
        for (String token : tokenizations) {
          final String trimmed = token.trim();
          if (trimmed.length() == 0) {
            continue;
          }
          // only a window of N tokens is labelled <other>; N == -1 disables the window
          final boolean outsideWindow = (N != -1) && (i > N);
          if (outsideWindow) {
            accumulatedText.append(trimmed + "\t" + "<ignore>\n");
            allContent.append(trimmed + " ");
          } else {
            try {
              accumulatedText.append(trimmed + "\t" + "<other>\n");
              allContent.append(trimmed + " ");
            } catch (Exception e) {
              throw new GrobidException("An exception occured while running Grobid.", e);
            }
          }
          i++;
        }

        accumulator.setLength(0);
        refFound = false;
      }
    } else if (qName.equals("patcit")) {
      // we register the citation, the citation context will be marked in a later stage
      if (citations == null) {
        citations = new ArrayList<String>();
      }
      citations.add(cited_number);
      accumulator.setLength(0);
    } else if (qName.equals("patent-document") || qName.equals("fulltext-document")) {
      final String allString = allContent.toString();
      journalsPositions = lexicon.inJournalNames(allString);
      abbrevJournalsPositions = lexicon.inAbbrevJournalNames(allString);
      conferencesPositions = lexicon.inConferenceNames(allString);
      publishersPositions = lexicon.inPublisherNames(allString);
      allContent = null;
    } else if (qName.equals("heading") || qName.equals("row")) {
      accumulator.append(" ");
    } else if (qName.equals("p")) {
      accumulator.append("\n");
    } else if (qName.equals("date")
        || qName.equals("classification-ipcr")
        || qName.equals("classification-symbol")
        || qName.equals("abstract")
        || qName.equals("invention-title")
        || qName.equals("applicants")
        || qName.equals("inventors")
        || qName.equals("document-id")
        || qName.equals("legal-status")
        || qName.equals("bibliographic-data")
        || qName.equals("doc-number")
        || qName.equals("country")
        || qName.equals("kind")
        || qName.equals("classification-ecla")) {
      // content of these elements is not used: drop whatever was buffered
      accumulator.setLength(0);
    }
  }
Beispiel #14
0
  /**
   * Marks the end of the DTD: the collected internal subset text is stored on the document's
   * DocType and both DTD state flags are cleared.
   *
   * @throws SAXException declared by the handler contract
   */
  public void endDTD() throws SAXException {
    document.getDocType().setInternalSubset(internalSubset.toString());

    // no longer inside the DTD or its internal subset
    inInternalSubset = false;
    inDTD = false;
  }
Beispiel #15
0
  /**
   * Records an element declaration as an {@code <!ELEMENT name model>} line in the internal subset
   * buffer. Declarations seen outside the internal subset are skipped.
   *
   * @param name name of the element
   * @param model content model of the element
   * @throws SAXException declared by the handler contract
   */
  public void elementDecl(String name, String model) throws SAXException {
    if (inInternalSubset) {
      internalSubset.append("  <!ELEMENT ");
      internalSubset.append(name);
      internalSubset.append(' ');
      internalSubset.append(model);
      internalSubset.append(">\n");
    }
  }
Beispiel #16
0
 /**
  * Returns the text collected so far, without leading or trailing whitespace.
  *
  * @return the trimmed content of the accumulator
  */
 public String getText() {
   final String collected = accumulator.toString();
   return collected.trim();
 }
Beispiel #17
0
  /**
   * Handles an opening tag and updates the handler state from the element's attributes.
   *
   * <ul>
   *   <li>{@code PAGE}: increments the current page counter.
   *   <li>{@code BLOCK}: starts a new block on the current page with a fresh text buffer and a
   *       zeroed token counter.
   *   <li>{@code IMAGE}: records the {@code href} and updates the current x, y, width and height.
   *   <li>{@code TOKEN}: updates the current font, size, style, colour, rotation and geometry; a
   *       change of font name or font size also appends a space to the block text.
   *   <li>{@code xi:include}: records the {@code href}.
   * </ul>
   *
   * @param namespaceURI namespace URI of the element (not used)
   * @param localName local name of the element (not used)
   * @param qName qualified name of the element being opened
   * @param atts attributes attached to the element
   * @throws SAXException declared by the handler contract
   * @throws NumberFormatException if a numeric attribute cannot be parsed by
   *     {@code Double.parseDouble}
   */
  public void startElement(String namespaceURI, String localName, String qName, Attributes atts)
      throws SAXException {
    if (qName.equals("PAGE")) {
      int length = atts.getLength();
      currentPage++;

      // Process each attribute
      // NOTE(review): every branch below is empty, so the PAGE attributes are read but not used.
      for (int i = 0; i < length; i++) {
        // Get names and values for each attribute
        String name = atts.getQName(i);
        String value = atts.getValue(i);

        if ((name != null) && (value != null)) {
          if (name.equals("id")) {;
          } else if (name.equals("number")) {

          } else if (name.equals("width")) {

          } else if (name.equals("height")) {

          }
        }
      }

      /*
       * if (block != null) { blabla.append("\n");
       * tokenizations.add("\n"); block.setText(blabla.toString());
       * block.setNbTokens(nbTokens); doc.addBlock(block); } Block block0
       * = new Block(); block0.setText("@PAGE\n"); block0.setNbTokens(0);
       * doc.addBlock(block0);
       */
      /*
       * block = new Block(); blabla = new StringBuffer(); nbTokens = 0;
       * //blabla.append("\n@block\n"); tokenizations.add("\n");
       */
    } else if (qName.equals("BLOCK")) {
      // start a new block on the current page
      block = new Block();
      blabla = new StringBuffer();
      nbTokens = 0;
      block.setPage(currentPage);
      // blabla.append("\n@block\n");
    } else if (qName.equals("IMAGE")) {
      int length = atts.getLength();

      // Process each attribute
      for (int i = 0; i < length; i++) {
        // Get names and values for each attribute
        String name = atts.getQName(i);
        String value = atts.getValue(i);

        if ((name != null) && (value != null)) {
          if (name.equals("href")) {
            // NOTE(review): no null check on images here; it is presumably initialised elsewhere.
            // if (images == null)
            // images = new ArrayList<String>();
            images.add(value);
          } else if (name.equals("x")) {
            double x = Double.parseDouble(value);
            if (x != currentX) {
              currentX = x;
            }
          } else if (name.equals("y")) {
            double y = Double.parseDouble(value);
            if (y != currentY) {
              currentY = y;
            }
          } else if (name.equals("width")) {
            double width = Double.parseDouble(value);
            if (width != currentWidth) {
              currentWidth = width;
            }
          } else if (name.equals("height")) {
            double height = Double.parseDouble(value);
            if (height != currentHeight) {
              currentHeight = height;
            }
          }
        }
      }

    } else if (qName.equals("TEXT")) {
      int length = atts.getLength();

      // Process each attribute
      // NOTE(review): every branch below is empty, so the TEXT attributes are read but not used.
      for (int i = 0; i < length; i++) {
        // Get names and values for each attribute
        String name = atts.getQName(i);
        String value = atts.getValue(i);

        if ((name != null) && (value != null)) {
          if (name.equals("id")) {

          } else if (name.equals("x")) {

          } else if (name.equals("y")) {

          } else if (name.equals("width")) {

          } else if (name.equals("height")) {

          }
        }
      }
    } else if (qName.equals("TOKEN")) {
      int length = atts.getLength();

      // Process each attribute
      for (int i = 0; i < length; i++) {
        // Get names and values for each attribute
        String name = atts.getQName(i);
        String value = atts.getValue(i);

        if ((name != null) && (value != null)) {
          if (name.equals("id")) {;
          } else if (name.equals("font-name")) {
            // a font change also inserts a space into the block text
            if (!value.equals(currentFont)) {
              currentFont = value;
              blabla.append(" ");
            }
          } else if (name.equals("font-size")) {
            double fontSize = Double.parseDouble(value);
            // a font-size change also inserts a space into the block text
            if (fontSize != currentFontSize) {
              currentFontSize = fontSize;

              blabla.append(" ");
            }
          } else if (name.equals("bold")) {
            if (value.equals("yes")) {
              currentBold = true;
            } else {
              currentBold = false;
            }
          } else if (name.equals("italic")) {
            if (value.equals("yes")) {
              currentItalic = true;
            } else {
              currentItalic = false;
            }
          } else if (name.equals("font-color")) {
            if (!value.equals(colorFont)) {
              colorFont = value;
            }
          } else if (name.equals("rotation")) {
            // any value other than "0" counts as rotated
            if (value.equals("0")) currentRotation = false;
            else currentRotation = true;
          } else if (name.equals("x")) {
            double x = Double.parseDouble(value);
            if (x != currentX) {
              currentX = x;
            }
          } else if (name.equals("y")) {
            double y = Double.parseDouble(value);
            if (y != currentY) {
              currentY = y;
            }
          } else if (name.equals("base")) {
            // parsed but not used in this method; a malformed value still raises
            // NumberFormatException
            double base = Double.parseDouble(value);

          } else if (name.equals("width")) {
            double width = Double.parseDouble(value);
            if (width != currentWidth) {
              currentWidth = width;
            }
          } else if (name.equals("height")) {
            double height = Double.parseDouble(value);
            if (height != currentHeight) {
              currentHeight = height;
            }
          }
        }
      }
    } else if (qName.equals("xi:include")) {
      int length = atts.getLength();

      // Process each attribute
      for (int i = 0; i < length; i++) {
        // Get names and values for each attribute
        String name = atts.getQName(i);
        String value = atts.getValue(i);

        if ((name != null) && (value != null)) {
          if (name.equals("href")) {
            // the include target is stored in the same list as IMAGE hrefs
            // if (images == null)
            // images = new ArrayList<String>();
            images.add(value);
          }
        }
      }
    }
    // accumulator.setLength(0);
  }
Beispiel #18
0
 /**
  * Appends a run of characters to the accumulator.
  *
  * @param ch array holding the characters
  * @param start index of the first character to copy
  * @param length number of characters to copy
  */
 public void characters(char[] ch, int start, int length) {
   accumulator.append(ch, start, length);
 }
 /**
  * Appends a run of characters to the text buffer.
  *
  * @param chars array holding the characters
  * @param start index of the first character to copy
  * @param len number of characters to copy
  * @throws SAXException declared by the handler contract; not thrown by the visible body
  */
 public void characters(char[] chars, int start, int len) throws SAXException {
   buffer.append(chars, start, len);
 }
  /**
   * Returns the accumulated text, annotated with citation markup while counting is on.
   *
   * <p>Whitespace-only content yields an empty string. When {@code counting} is set, the text is
   * scanned against the known patent items and article references: a matching citation context is
   * wrapped in {@code <ref type="patent">} or {@code <ref type="npl">} tags, and {@code offset} is
   * advanced by the length of the un-annotated text.
   *
   * @return the raw accumulated text, the annotated text, or an empty string
   */
  public String getText() {
    String text = accumulator.toString();
    if (text.trim().length() == 0) {
      return "";
    }
    /*text = text.replace("\n", " ");
    text = text.replace("  ", " ");*/
    if (counting) {
      /*


      StringTokenizer st = new StringTokenizer(text, delimiters, true);
      int count = 0;

      while(st.hasMoreTokens()) {
      	String token = st.nextToken().trim();
      	if (token.length() == 0) {
      		continue;
      	}
      	count++;
      }
      */

      // patents are scanned from the last matched index onwards
      int i = currentPatentIndex;
      // offsets are measured in characters: count is the length of this text chunk
      int count = text.length();

      while (i < patents.size()) {
        PatentItem currentPatent = patents.get(i);
        if (currentPatent != null) {
          int startOffset = currentPatent.getOffsetBegin();
          int endOffset = currentPatent.getOffsetEnd();

          // only citations lying entirely inside the current chunk [offset, offset + count]
          if ((startOffset >= offset) && (endOffset <= offset + count)) {
            String context = currentPatent.getContext();

            /*System.out.println("OFFSET: " + offset);
            System.out.println("count: " + count);
            System.out.println("startOffset: " + startOffset);
            System.out.println("endOffset: " + endOffset);
            System.out.println("context: " + context);
            System.out.println("text: " + text);*/

            // a leading space of the context is kept outside the <ref> element
            // NOTE(review): charAt(0) throws if the context is empty
            String target = "";
            if (context.charAt(0) == ' ') {
              target = " <ref type=\"patent\">" + context.substring(1, context.length()) + "</ref>";
            } else {
              target = "<ref type=\"patent\">" + context + "</ref>";
            }

            // replaces every occurrence of the context string in the chunk
            text = text.replace(context, target);
            currentPatentIndex = i;
          }
        }

        i++;
      }

      // articles are always scanned from the beginning
      // i = currentArticleIndex;
      i = 0;
      while (i < articles.size()) {
        BibDataSet currentArticle = articles.get(i);
        if (currentArticle != null) {
          List<Integer> offsets = currentArticle.getOffsets();
          // both stay at -1 when no first offset is available
          int startOffset = -1;
          int endOffset = -1;
          String context = currentArticle.getRawBib().trim();
          if (offsets.size() > 0) {
            if (offsets.get(0) != null) {
              startOffset = offsets.get(0).intValue();
              /*StringTokenizer stt = new StringTokenizer(context, delimiters, true);
              int count2 = 0;
              while(stt.hasMoreTokens()) {
              	String token2 = stt.nextToken().trim();
              	if (token2.length() == 0) {
              		continue;
              	}
              	count2++;
              }*/
              // endOffset = offsets.get(1).intValue();
              endOffset = startOffset + context.length();
            }
          }

          // only the start offset is checked; endOffset is computed but not used in the condition
          // if ( (startOffset >= offset) && (endOffset <= offset+count) ) {
          if ((startOffset >= offset)) {
            /*System.out.println("OFFSET: " + offset);
            System.out.println("count: " + count);
            System.out.println("startOffset: " + startOffset);
            System.out.println("endOffset: " + endOffset);
            System.out.println("context: " + context);
            System.out.println("text: " + text);*/

            // replaces every occurrence of the raw reference string in the chunk
            String target = " <ref type=\"npl\">" + context + "</ref> ";
            text = text.replace(context, target);
            currentArticleIndex = i;
          }
        }

        i++;
      }

      // advance by the original chunk length, not by the length of the annotated text
      offset += count;
    }

    return text;
  }
 /**
  * Appends a run of characters to the accumulator.
  *
  * @param buffer array holding the characters
  * @param start index of the first character to copy
  * @param length number of characters to copy
  */
 public void characters(char[] buffer, int start, int length) {
   accumulator.append(buffer, start, length);
 }
Beispiel #22
0
  /**
   * Handles an opening tag while building labelled training text.
   *
   * <ul>
   *   <li>{@code patent-document} / {@code fulltext-document}: resets the reference counters, reads
   *       the {@code doc-number}, {@code kind} and {@code date} attributes and starts fresh output
   *       buffers.
   *   <li>{@code ref} / {@code bibl}: counts the reference and, for each {@code type} / {@code typ}
   *       attribute, emits the text read so far as labelled tokens and sets the {@code npl} and
   *       {@code ref} flags according to the attribute value; an unknown value clears both flags
   *       and prints a warning.
   *   <li>{@code description}, {@code claim}, {@code invention-title}: clear the text buffer.
   *   <li>{@code patcit}: remembers the {@code ucid} attribute as the current cited number.
   * </ul>
   *
   * @param namespaceURI namespace URI of the element (not used)
   * @param localName local name of the element (not used)
   * @param qName qualified name of the element being opened
   * @param atts attributes attached to the element
   * @throws SAXException declared by the handler contract
   * @throws GrobidException wrapping exceptions raised while appending labelled tokens
   */
  public void startElement(String namespaceURI, String localName, String qName, Attributes atts)
      throws SAXException {
    if (qName.equals("patent-document") || qName.equals("fulltext-document")) {
      nbNPLRef = 0;
      nbPatentRef = 0;
      nbAllRef = 0;

      final int length = atts.getLength();
      for (int i = 0; i < length; i++) {
        final String name = atts.getQName(i);
        final String value = atts.getValue(i);
        if (name == null) {
          continue;
        }
        if (name.equals("doc-number")) {
          PatentNumber = "EP" + value;
        } else if (name.equals("kind")) {
          CodeType = value;
        } else if (name.equals("date")) {
          PublicDate = value;
        }
      }

      CitedPatentNumber = new ArrayList<String>();
      accumulatedText = new StringBuffer();
      allContent = new StringBuffer();
      accumulator.setLength(0);
    } else if (qName.equals("ref") || qName.equals("bibl")) {
      final int length = atts.getLength();
      nbAllRef++;

      for (int i = 0; i < length; i++) {
        final String name = atts.getQName(i);
        final String value = atts.getValue(i);
        if (name == null || !(name.equals("type") || name.equals("typ"))) {
          continue;
        }

        if (value.equals("npl") || value.equals("book") || value.equals("journal")) {
          // we output what has been read so far in the description
          emitPrecedingContext();
          npl = true;
          ref = true;
        } else if (value.equals("patent") || value.equals("pl")) {
          // we output what has been read so far in the description
          emitPrecedingContext();
          npl = false;
          ref = true;
        } else {
          System.out.println("Warning: unknown attribute value for ref or bibl: " + value);
          ref = false;
          npl = false;
        }
      }

      accumulatorRef.setLength(0);
    } else if (qName.equals("description")
        || qName.equals("claim")
        || qName.equals("invention-title")) {
      accumulator.setLength(0);
    } else if (qName.equals("patcit")) {
      final int length = atts.getLength();
      for (int i = 0; i < length; i++) {
        final String name = atts.getQName(i);
        final String value = atts.getValue(i);
        if (name != null && name.equals("ucid")) {
          // we normally need to normalize a little bit this patent number
          cited_number = value;
        }
      }
    }
  }

  /**
   * Tokenises the text buffered so far and appends each non-blank token to the training output,
   * then clears the buffer. A token is labelled {@code <other>} when it lies in the window of
   * {@code N} tokens at the end of the text or, if a reference was already found, at its start;
   * every other token is labelled {@code <ignore>}. With {@code N == -1} all tokens are labelled
   * {@code <ignore>}.
   */
  private void emitPrecedingContext() {
    final String content = getText();

    List<String> tokenizations = new ArrayList<String>();
    try {
      // TBD: pass a language object to the tokenize method call
      tokenizations = analyzer.tokenize(content);
    } catch (Exception e) {
      LOGGER.debug("Tokenization for XML patent document has failed.");
    }

    final int nbTokens = tokenizations.size();
    int j = 0;
    for (String token : tokenizations) {
      // skip whitespace-only tokens (covers " ", "\t", "\n" and "\r")
      if (token.trim().length() == 0) {
        continue;
      }

      final boolean inWindow = (N != -1) && ((j > (nbTokens - N)) || (refFound && (j < N)));
      final String label = inWindow ? "<other>\n" : "<ignore>\n";
      try {
        accumulatedText.append(token + "\t" + label);
        allContent.append(token + " ");
      } catch (Exception e) {
        throw new GrobidException("An exception occured while running Grobid.", e);
      }
      j++;
    }

    accumulator.setLength(0);
  }
Beispiel #23
0
  /**
   * SAX callback invoked when an element is closed. Depending on the element name it updates the
   * current {@code block}, the running text buffer {@code blabla} and the {@code tokenizations}
   * list:
   *
   * <ul>
   *   <li>{@code TEXT}: appends a newline token to the block, the text buffer and the
   *       tokenization list.
   *   <li>{@code METADATA}: discards the accumulated character data.
   *   <li>{@code TOKEN}: cleans the accumulated text, splits it on {@code
   *       TextUtilities.fullPunctuations}, merges modifier characters (accents, diaereses, ...)
   *       with the neighbouring token, and records one {@code LayoutToken} per resulting piece.
   *   <li>{@code PAGE}: flushes the current block, adds an {@code @PAGE} marker block and starts
   *       a new block.
   *   <li>{@code IMAGE} and {@code xi:include}: flush the current block, add an {@code @IMAGE}
   *       block for the last recorded image (if any) and start a new block.
   *   <li>{@code BLOCK}: finalizes the geometry of the current block, adds it to the document and
   *       clears the current block reference.
   * </ul>
   *
   * Any other element name is ignored.
   *
   * @param uri namespace URI of the element (not used here)
   * @param localName local name of the element (not used here)
   * @param qName qualified name of the element; selects the branch to execute
   * @throws SAXException declared by the SAX contract; not thrown directly by this method
   */
  public void endElement(java.lang.String uri, java.lang.String localName, java.lang.String qName)
      throws SAXException {
    if (qName.equals("TEXT")) {
      // End of a text line: materialize the line break as a token of its own.
      blabla.append("\n");
      LayoutToken token = new LayoutToken();
      token.setText("\n");
      block.addToken(token);
      accumulator.setLength(0);
      tokenizations.add("\n");
    } else if (qName.equals("METADATA")) {
      accumulator.setLength(0);
    } else if (qName.equals("TOKEN")) {
      String tok0 = TextUtilities.clean(getText());

      // -1 is treated as "start token not set yet" for the current block.
      if (block.getStartToken() == -1) {
        block.setStartToken(tokenizations.size());
      }

      if (tok0.length() > 0) {
        StringTokenizer st = new StringTokenizer(tok0, TextUtilities.fullPunctuations, true);
        while (st.hasMoreTokens()) {
          // Set when the current piece has been merged into the previous token as a modifier.
          boolean diaresis = false;
          boolean accent = false;

          String tok = st.nextToken();
          if (tok.length() > 0) {

            LayoutToken token = new LayoutToken();

            if ((previousToken != null) && (previousToken.length() > 0) && blabla.length() > 0) {

              Character leftChar = previousTok.getText().charAt(previousTok.getText().length() - 1);
              Character rightChar = tok.charAt(0);

              ModifierClass leftClass = classifyChar(leftChar);
              ModifierClass rightClass = classifyChar(rightChar);
              ModifierClass modifierClass = ModifierClass.NOT_A_MODIFIER;

              if (leftClass != ModifierClass.NOT_A_MODIFIER
                  || rightClass != ModifierClass.NOT_A_MODIFIER) {
                Character baseChar = null;
                Character modifierChar = null;

                if (leftClass != ModifierClass.NOT_A_MODIFIER) {
                  if (rightClass != ModifierClass.NOT_A_MODIFIER) {
                    // Both sides are modifiers: keep the characters, but treat the
                    // combination as "not a modifier".
                    baseChar = leftChar;
                    modifierChar = rightChar;
                    modifierClass = ModifierClass.NOT_A_MODIFIER;
                  } else {
                    baseChar = rightChar;
                    modifierChar = leftChar;
                    modifierClass = leftClass;
                  }
                } else {
                  baseChar = leftChar;
                  modifierChar = rightChar;
                  modifierClass = rightClass;
                }

                String updatedChar = modifyCharacter(baseChar, modifierChar);

                // Drop the last one or two recorded entries/characters (presumably the
                // separator and the previous token) before re-adding the merged token.
                tokenizations.remove(tokenizations.size() - 1);
                if (tokenizations.size() > 0) {
                  tokenizations.remove(tokenizations.size() - 1);
                }

                blabla.deleteCharAt(blabla.length() - 1);
                if (blabla.length() > 0) {
                  blabla.deleteCharAt(blabla.length() - 1);
                }

                removeLastCharacterIfPresent(previousTok);

                if (updatedChar != null) {
                  blabla.append(updatedChar);
                  previousTok.setText(previousTok.getText() + updatedChar);
                }

                // The first character of tok has been consumed by the merge; append the rest.
                blabla.append(tok.substring(1, tok.length()));
                previousTok.setText(previousTok.getText() + tok.substring(1, tok.length()));
                tokenizations.add(previousTok.getText());

                diaresis =
                    (modifierClass == ModifierClass.DIAERESIS
                        || modifierClass == ModifierClass.NORDIC_RING
                        || modifierClass == ModifierClass.CZECH_CARON
                        || modifierClass == ModifierClass.TILDE
                        || modifierClass == ModifierClass.CEDILLA);

                accent =
                    (modifierClass == ModifierClass.ACUTE_ACCENT
                        || modifierClass == ModifierClass.CIRCUMFLEX
                        || modifierClass == ModifierClass.GRAVE_ACCENT);

                if (rightClass != ModifierClass.NOT_A_MODIFIER) {
                  // The current piece was a single modifier item: nothing left to emit.
                  tok = "";
                }
              }
            }

            // In certain cases the extracted string under TOKEN is a chunk of text with
            // separators that need to be preserved, so spaces are deliberately not stripped.

            if ((!diaresis) && (!accent)) {
              blabla.append(tok);
              token.setText(tok);

              tokenizations.add(tok);
            } else {
              tok = "";
            }

            // NOTE(review): this halves the currentFontSize variable itself on every sub-token,
            // so a TOKEN split into several pieces is halved once per piece unless the value
            // is reset elsewhere - confirm this is intended.
            if (currentRotation) {
              currentFontSize = currentFontSize / 2;
            }

            if (currentFont != null) {
              token.setFont(currentFont.toLowerCase());
            } else {
              token.setFont("default");
            }
            token.setItalic(currentItalic);
            token.setBold(currentBold);
            token.setRotation(currentRotation);
            token.setColorFont(colorFont);
            token.setX(currentX);
            token.setY(currentY);
            token.setWidth(currentWidth);
            token.setHeight(currentHeight);
            token.setFontSize(currentFontSize);

            if (!diaresis && !accent) {
              block.addToken(token);
            }

            // The block inherits its layout attributes from the first token that provides them.
            if (block.getFont() == null) {
              if (currentFont != null) {
                block.setFont(currentFont.toLowerCase());
              } else {
                // Fix: the fallback must be applied to the block, not to the token (which
                // already received its own "default" font above).
                block.setFont("default");
              }
            }
            if (nbTokens == 0) {
              block.setItalic(currentItalic);
              block.setBold(currentBold);
            }
            if (block.getColorFont() == null) {
              block.setColorFont(colorFont);
            }
            if (block.getX() == 0.0) {
              block.setX(currentX);
            }
            if (block.getY() == 0.0) {
              block.setY(currentY);
            }
            if (block.getWidth() == 0.0) {
              block.setWidth(currentWidth);
            }
            if (block.getHeight() == 0.0) {
              block.setHeight(currentHeight);
            }
            if (block.getFontSize() == 0.0) {
              block.setFontSize(currentFontSize);
            }

            if (!diaresis && !accent) {
              previousToken = tok;
              previousTok = token;
            } else {
              // The piece was merged into previousTok; keep the string view in sync.
              previousToken = previousTok.getText();
            }

            nbTokens++;
            accumulator.setLength(0);
          }
        }
        // Separate this TOKEN from the next one with a space, except after a trailing hyphen.
        if (tokenizations.size() > 0) {
          String justBefore = tokenizations.get(tokenizations.size() - 1);
          if (!justBefore.endsWith("-")) {
            tokenizations.add(" ");
            blabla.append(" ");
          }
        }
      }
      block.setEndToken(tokenizations.size());
    } else if (qName.equals("PAGE")) {
      // Page markers are useful to detect headers (same first line(s) appearing on each page).
      if (block != null) {
        blabla.append("\n");
        tokenizations.add("\n");
        block.setText(blabla.toString());
        block.setNbTokens(nbTokens);
        doc.addBlock(block);
      }
      Block block0 = new Block();
      block0.setText("@PAGE\n");
      block0.setNbTokens(0);
      block0.setPage(currentPage);
      doc.addBlock(block0);
      block = new Block();
      block.setPage(currentPage);
      blabla = new StringBuffer();
      nbTokens = 0;
      tokenizations.add("\n");
    } else if (qName.equals("IMAGE")) {
      if (block != null) {
        blabla.append("\n");
        block.setText(blabla.toString());
        block.setNbTokens(nbTokens);
        doc.addBlock(block);
      }
      block = new Block();
      block.setPage(currentPage);
      blabla = new StringBuffer();
      if (images.size() > 0) {
        blabla.append("@IMAGE " + images.get(images.size() - 1) + "\n");
      }
      block.setText(blabla.toString());
      block.setNbTokens(nbTokens);
      if (block.getX() == 0.0) {
        block.setX(currentX);
      }
      if (block.getY() == 0.0) {
        block.setY(currentY);
      }
      if (block.getWidth() == 0.0) {
        block.setWidth(currentWidth);
      }
      if (block.getHeight() == 0.0) {
        block.setHeight(currentHeight);
      }
      doc.addBlock(block);
      blabla = new StringBuffer();
      nbTokens = 0;
      block = new Block();
      block.setPage(currentPage);
    } else if (qName.equals("BLOCK")) {
      blabla.append("\n");
      tokenizations.add("\n");
      block.setText(blabla.toString());
      block.setNbTokens(nbTokens);

      // Extent of the block: from its recorded origin to the far edge of the last token seen.
      block.setWidth(currentX - block.getX() + currentWidth);
      block.setHeight(currentY - block.getY() + currentHeight);

      doc.addBlock(block);
      // blabla is intentionally not reset here.
      nbTokens = 0;
      block = null;
    } else if (qName.equals("xi:include")) {
      if (block != null) {
        blabla.append("\n");
        block.setText(blabla.toString());
        block.setNbTokens(nbTokens);
        doc.addBlock(block);
      }
      block = new Block();
      block.setPage(currentPage);
      blabla = new StringBuffer();
      // Fix: guard against an empty image list, as the IMAGE branch does, instead of
      // failing with an IndexOutOfBoundsException.
      if (images.size() > 0) {
        blabla.append("@IMAGE " + images.get(images.size() - 1) + "\n");
      }
      block.setText(blabla.toString());
      block.setNbTokens(nbTokens);
      doc.addBlock(block);
      blabla = new StringBuffer();
      nbTokens = 0;
      block = new Block();
      block.setPage(currentPage);
    }
  }