예제 #1
0
    /**
     * Drains the content list of a JDOM Element into a new list.
     *
     * <p>Items are removed from the list returned by {@code elt.getContent()} one at a time,
     * always from the front, so the returned list keeps the original order.
     *
     * @param elt the element whose content is taken.
     * @return a (possibly empty) list holding the removed JDOM nodes, in their original order.
     */
    private List getDetachedContent(Element elt) {
      List source = elt.getContent();
      List detached = new ArrayList(source.size());

      // Always take the head so that the order of the nodes is preserved.
      while (!source.isEmpty()) {
        detached.add(source.remove(0));
      }
      return detached;
    }
예제 #2
0
  /**
   * Outputs a list of JDOM nodes as a fragment of an XML document, firing off the SAX events that
   * have been registered.
   *
   * <p><strong>Warning</strong>: This method does not call the {@link
   * ContentHandler#setDocumentLocator}, {@link ContentHandler#startDocument} and {@link
   * ContentHandler#endDocument} callbacks on the {@link #setContentHandler ContentHandler}. The
   * user shall invoke these methods directly prior/after outputting the document fragments.
   *
   * @param nodes <code>List</code> of JDOM nodes to output; a <code>null</code> or empty list is
   *     silently ignored.
   * @throws JDOMException if any error occurred.
   * @see #outputFragment(org.jdom2.Content)
   */
  public void outputFragment(List<? extends Content> nodes) throws JDOMException {
    boolean hasNodes = (nodes != null) && !nodes.isEmpty();
    if (hasNodes) {
      // Emit the node list as a document fragment, starting from a fresh namespace stack.
      elementContent(nodes, new NamespaceStack());
    }
  }
예제 #3
0
  /**
   * Returns the result of an XSL Transformation as a list of JDOM nodes.
   *
   * <p>If the stored result is already a list, it is returned as is. If it is a JDOM document
   * that has not been queried yet, the document's content is drained into a new list and that
   * list replaces the stored result; as originally documented, any subsequent call to {@link
   * #getDocument} will then return <code>null</code>. In every other case an empty list is
   * returned. The result is marked as queried on every call.
   *
   * @return the transformation result as a (possibly empty) list of JDOM nodes (Elements, Texts,
   *     Comments, PIs...).
   */
  public List getResult() {
    // Retrieve result from the document builder if not set.
    this.retrieveResult();

    List nodes;
    if (result instanceof List) {
      nodes = (List) result;
    } else if ((result instanceof Document) && !queried) {
      // Move the document's content, front to back, into a standalone list.
      List docContent = ((Document) result).getContent();
      nodes = new ArrayList(docContent.size());
      while (!docContent.isEmpty()) {
        nodes.add(docContent.remove(0));
      }
      result = nodes;
    } else {
      nodes = Collections.EMPTY_LIST;
    }

    queried = true;
    return nodes;
  }
예제 #4
0
  /**
   * Outputs a list of JDOM nodes as a document, firing off the SAX events that have been
   * registered.
   *
   * <p><strong>Warning</strong>: This method may output ill-formed XML documents if the list
   * contains top-level objects that are not legal at the document level (e.g. Text or CDATA nodes,
   * multiple Element nodes, etc.). Thus, it should only be used to output document portions towards
   * ContentHandlers capable of accepting such ill-formed documents (such as XSLT processors).
   *
   * @param nodes <code>List</code> of JDOM nodes to output; a <code>null</code> or empty list
   *     produces no events at all.
   * @throws JDOMException if any error occurred.
   * @see #output(org.jdom2.Document)
   */
  public void output(List<? extends Content> nodes) throws JDOMException {
    boolean nothingToOutput = (nodes == null) || nodes.isEmpty();
    if (nothingToOutput) {
      return;
    }

    // Open the document: contentHandler.setDocumentLocator(), then contentHandler.startDocument().
    documentLocator(null);
    startDocument();

    // Process the node list with a fresh namespace stack.
    elementContent(nodes, new NamespaceStack());

    // Close the document: contentHandler.endDocument().
    endDocument();
  }
예제 #5
0
  /**
   * This reports the occurrence of an actual element. The element is built through the factory,
   * receives any pending declared namespaces and its attributes (<code>xmlns</code> and <code>
   * xmlns:[namespace prefix]</code> attributes are skipped), and is then attached either as the
   * document root or as a child of the current element.
   *
   * <p>Prefixes are read from the part of a qualified name that precedes the colon. A qualified
   * name without a colon (SAX permits an empty one when namespace prefixes are not reported) is
   * treated as unprefixed instead of raising a <code>StringIndexOutOfBoundsException</code>.
   *
   * @param namespaceURI <code>String</code> namespace URI this element is associated with, or an
   *     empty <code>String</code>
   * @param localName <code>String</code> name of element (with no namespace prefix, if one is
   *     present)
   * @param qName <code>String</code> XML 1.0 version of element name: [namespace
   *     prefix]:[localName]
   * @param atts <code>Attributes</code> list for this element
   * @throws SAXException when things go wrong
   */
  public void startElement(String namespaceURI, String localName, String qName, Attributes atts)
      throws SAXException {
    if (suppress) return;

    Element element = null;

    if ((namespaceURI != null) && (!namespaceURI.equals(""))) {
      String prefix = "";

      // Determine any prefix on the Element. Only a qualified name that really contains a
      // colon carries one; otherwise the element keeps the empty (default) prefix.
      if (!qName.equals(localName)) {
        int split = qName.indexOf(':');
        if (split > 0) {
          prefix = qName.substring(0, split);
        }
      }
      Namespace elementNamespace = Namespace.getNamespace(prefix, namespaceURI);
      element = factory.element(localName, elementNamespace);
    } else {
      element = factory.element(localName);
    }

    // Take leftover declared namespaces and add them to this element's
    // map of namespaces
    if (declaredNamespaces.size() > 0) {
      transferNamespaces(element);
    }

    // Handle attributes
    for (int i = 0, len = atts.getLength(); i < len; i++) {
      Attribute attribute = null;

      String attLocalName = atts.getLocalName(i);
      String attQName = atts.getQName(i);
      int attType = getAttributeType(atts.getType(i));

      // Bypass any xmlns attributes which might appear, as we got
      // them already in startPrefixMapping().
      // This is sometimes necessary when SAXHandler is used with
      // another source than SAXBuilder, as with JDOMResult.
      if (attQName.startsWith("xmlns:") || attQName.equals("xmlns")) {
        continue;
      }

      int attColon = attQName.indexOf(':');

      // First clause per http://markmail.org/message/2p245ggcjst27xe6
      // patch from Mattias Jiderhamn
      if ("".equals(attLocalName) && attColon == -1) {
        attribute = factory.attribute(attQName, atts.getValue(i), attType);
      } else if (attColon != -1 && !attQName.equals(attLocalName)) {
        // Prefixed attribute: bind the prefix to the URI reported for it.
        String attPrefix = attQName.substring(0, attColon);
        Namespace attNs = Namespace.getNamespace(attPrefix, atts.getURI(i));

        attribute = factory.attribute(attLocalName, atts.getValue(i), attType, attNs);
      } else {
        // No prefix available (qualified name equals the local name, or has no colon at all):
        // create the attribute from its local name without a namespace.
        attribute = factory.attribute(attLocalName, atts.getValue(i), attType);
      }
      factory.setAttribute(element, attribute);
    }

    // Pending character data is flushed before the new element is attached.
    flushCharacters();

    if (atRoot) {
      document.setRootElement(element); // XXX should we use a factory call?
      atRoot = false;
    } else {
      factory.addContent(getCurrentElement(), element);
    }
    currentElement = element;
  }
예제 #6
0
  /**
   * SAX callback invoked at the start of each element; dispatches on the qualified name.
   *
   * <ul>
   *   <li><code>patent-document</code> / <code>fulltext-document</code>: resets the reference
   *       counters and text buffers and reads the <code>doc-number</code>, <code>kind</code> and
   *       <code>date</code> attributes.
   *   <li><code>description</code>, <code>claim</code>, <code>invention-title</code>: clears the
   *       text accumulator.
   *   <li><code>ref</code> / <code>bibl</code>: counts the reference and, depending on its
   *       <code>type</code>/<code>typ</code> attribute, labels the text read so far and sets the
   *       <code>npl</code> and <code>ref</code> flags.
   *   <li><code>patcit</code>: stores the <code>ucid</code> attribute as the cited number.
   * </ul>
   *
   * @param namespaceURI namespace URI of the element (not used here)
   * @param localName local name of the element (not used here)
   * @param qName qualified name of the element, used for dispatching
   * @param atts attributes attached to the element
   * @throws SAXException declared by the SAX contract
   */
  public void startElement(String namespaceURI, String localName, String qName, Attributes atts)
      throws SAXException {
    if (qName.equals("patent-document") || qName.equals("fulltext-document")) {
      nbNPLRef = 0;
      nbPatentRef = 0;
      nbAllRef = 0;

      int length = atts.getLength();
      for (int i = 0; i < length; i++) {
        String name = atts.getQName(i);
        String value = atts.getValue(i);
        if (name == null) {
          continue;
        }

        // The "lang" attribute is currently ignored.
        if (name.equals("doc-number")) {
          PatentNumber = "EP" + value;
        }
        if (name.equals("kind")) {
          CodeType = value;
        }
        if (name.equals("date")) {
          PublicDate = value;
        }
      }

      CitedPatentNumber = new ArrayList<String>();
      accumulatedText = new StringBuffer();
      allContent = new StringBuffer();
      accumulator.setLength(0);
    } else if (qName.equals("description")) {
      accumulator.setLength(0);
    } else if (qName.equals("ref") || qName.equals("bibl")) {
      int length = atts.getLength();
      nbAllRef++;

      for (int i = 0; i < length; i++) {
        String name = atts.getQName(i);
        String value = atts.getValue(i);

        if ((name != null) && (name.equals("type") || name.equals("typ"))) {
          if (value.equals("npl") || value.equals("book") || value.equals("journal")) {
            // Non-patent literature reference.
            labelTextBeforeReference();
            npl = true;
            ref = true;
          } else if (value.equals("patent") || value.equals("pl")) {
            // Patent reference.
            labelTextBeforeReference();
            npl = false;
            ref = true;
          } else {
            System.out.println("Warning: unknown attribute value for ref or bibl: " + value);
            ref = false;
            npl = false;
          }
        }
      }

      accumulatorRef.setLength(0);
    } else if (qName.equals("claim")) {
      accumulator.setLength(0);
    } else if (qName.equals("invention-title")) {
      accumulator.setLength(0);
    } else if (qName.equals("patcit")) {
      int length = atts.getLength();

      for (int i = 0; i < length; i++) {
        String name = atts.getQName(i);
        String value = atts.getValue(i);

        if ((name != null) && name.equals("ucid")) {
          cited_number = value;
          // we normally need to normalize a little bit this patent number
        }
      }
    }
  }

  /**
   * Tokenizes the text read so far (as returned by {@link #getText()}), appends one labelled line
   * per non-blank token to <code>accumulatedText</code> and the token followed by a space to
   * <code>allContent</code>, then clears the text accumulator.
   *
   * <p>A token is labelled <code>&lt;other&gt;</code> when <code>N</code> is not -1 and its index
   * among the non-blank tokens is either greater than <code>nbTokens - N</code> or, if a reference
   * was already found, lower than <code>N</code>; every other token is labelled <code>
   * &lt;ignore&gt;</code>. If tokenization fails, the failure is logged and nothing is appended.
   *
   * @throws SAXException declared only so that this helper may propagate anything the calling
   *     SAX callback is itself allowed to throw
   */
  private void labelTextBeforeReference() throws SAXException {
    String content = getText();

    List<String> tokenizations = new ArrayList<String>();
    try {
      // TBD: pass a language object to the tokenize method call
      tokenizations = analyzer.tokenize(content);
    } catch (Exception e) {
      LOGGER.debug("Tokenization for XML patent document has failed.");
    }

    int nbTokens = tokenizations.size();
    int j = 0;
    for (String token : tokenizations) {
      // Blank tokens (spaces, tabs, line breaks) are skipped and do not advance the index.
      if (token.trim().length() == 0) {
        continue;
      }

      boolean nearReference = (N != -1) && ((j > (nbTokens - N)) || (refFound && (j < N)));
      String label = nearReference ? "<other>\n" : "<ignore>\n";
      try {
        accumulatedText.append(token + "\t" + label);
        allContent.append(token + " ");
      } catch (Exception e) {
        throw new GrobidException("An exception occured while running Grobid.", e);
      }
      j++;
    }

    accumulator.setLength(0);
  }
  /**
   * Returns the text accumulated so far, or an empty string when it only contains whitespace.
   *
   * <p>When <code>counting</code> is set, the text is additionally annotated:
   *
   * <ul>
   *   <li>for each patent item, starting at <code>currentPatentIndex</code>, whose offsets lie
   *       within the window <code>[offset, offset + text length]</code>, every occurrence of its
   *       context is wrapped in <code>&lt;ref type="patent"&gt;</code> (a leading space of the
   *       context is kept outside the tag);
   *   <li>for each article whose first offset is at or beyond <code>offset</code>, every
   *       occurrence of its trimmed raw bibliographical string is wrapped in <code>&lt;ref
   *       type="npl"&gt;</code>, surrounded by spaces;
   *   <li><code>offset</code> is finally advanced by the length of the un-annotated text.
   * </ul>
   *
   * <p>Items with a missing or empty context are never substituted: an empty search string would
   * otherwise match between every character of the text.
   *
   * @return the accumulated text, annotated when <code>counting</code> is set
   */
  public String getText() {
    String text = accumulator.toString();
    if (text.trim().length() == 0) {
      return "";
    }

    if (counting) {
      // Length of the raw text; deliberately not recomputed after the substitutions below.
      int count = text.length();

      for (int i = currentPatentIndex; i < patents.size(); i++) {
        PatentItem currentPatent = patents.get(i);
        if (currentPatent == null) {
          continue;
        }

        int startOffset = currentPatent.getOffsetBegin();
        int endOffset = currentPatent.getOffsetEnd();
        if ((startOffset >= offset) && (endOffset <= offset + count)) {
          String context = currentPatent.getContext();
          if ((context != null) && (context.length() > 0)) {
            String target;
            if (context.charAt(0) == ' ') {
              target = " <ref type=\"patent\">" + context.substring(1) + "</ref>";
            } else {
              target = "<ref type=\"patent\">" + context + "</ref>";
            }
            text = text.replace(context, target);
          }
          currentPatentIndex = i;
        }
      }

      // All articles are scanned on every call (not only from currentArticleIndex onwards).
      for (int i = 0; i < articles.size(); i++) {
        BibDataSet currentArticle = articles.get(i);
        if (currentArticle == null) {
          continue;
        }

        List<Integer> offsets = currentArticle.getOffsets();
        int startOffset = -1;
        if ((offsets != null) && (offsets.size() > 0) && (offsets.get(0) != null)) {
          startOffset = offsets.get(0).intValue();
        }

        String rawBib = currentArticle.getRawBib();
        String context = (rawBib == null) ? "" : rawBib.trim();

        if (startOffset >= offset) {
          if (context.length() > 0) {
            String target = " <ref type=\"npl\">" + context + "</ref> ";
            text = text.replace(context, target);
          }
          currentArticleIndex = i;
        }
      }

      offset += count;
    }

    return text;
  }