Beispiel #1
  /**
   * This will invoke the <code>startElement</code> callback in the <code>ContentHandler</code>.
   *
   * @param element <code>Element</code> used in callbacks.
   * @param nsAtts <code>Attributes</code> holding the namespaces to declare with the element or
   *     <code>null</code>.
   */
  private void startElement(Element element, Attributes nsAtts) throws JDOMException {
    // The three names handed to the ContentHandler: namespace URI, local name, qualified name.
    String namespaceURI = element.getNamespaceURI();
    String localName = element.getName();
    String rawName = element.getQualifiedName();

    // Allocate attribute list.
    // Built from nsAtts when it is non-null, otherwise starts empty.
    AttributesImpl atts = (nsAtts != null) ? new AttributesImpl(nsAtts) : new AttributesImpl();

    List attributes = element.getAttributes();
    Iterator i = attributes.iterator();
    while (i.hasNext()) {
      // NOTE(review): the cast below has no operand (presumably i.next()) and the rest of the
      // loop body is missing from this excerpt; restore it from the original source.
      Attribute a = (Attribute);

    try {
      contentHandler.startElement(namespaceURI, localName, rawName, atts);
    } catch (SAXException se) {
      // The SAXException is kept as the cause of the JDOMException.
      throw new JDOMException("Exception in startElement", se);
Beispiel #2
 /**
  * This will take the supplied <code>{@link Element}</code> and transfer its namespaces to the
  * global namespace storage.
  *
  * @param element <code>Element</code> to read namespaces from.
  */
 private void transferNamespaces(Element element) {
   // Walk every entry currently held in declaredNamespaces.
   Iterator i = declaredNamespaces.iterator();
   while (i.hasNext()) {
     // NOTE(review): the cast has no operand (presumably i.next()); this excerpt is truncated.
     Namespace ns = (Namespace);
     // Reference comparison: the branch is taken for any namespace object other than the
     // element's own one. The branch body is not visible in this excerpt.
     if (ns != element.getNamespace()) {
Beispiel #3
  /**
   * This will add the prefix mapping to the JDOM <code>Document</code> object.
   *
   * @param prefix <code>String</code> namespace prefix.
   * @param uri <code>String</code> namespace URI.
   */
  public void startPrefixMapping(String prefix, String uri) throws SAXException {

    // Nothing is recorded while the suppress flag is set.
    if (suppress) return;

    Namespace ns = Namespace.getNamespace(prefix, uri);
    // NOTE(review): the excerpt ends here; what is done with ns is not visible.
Beispiel #4
  /**
   * This will output a list of JDOM nodes as a fragment of an XML document, firing off the SAX
   * events that have been registered.
   *
   * <p><strong>Warning</strong>: This method does not call the {@link
   * ContentHandler#setDocumentLocator}, {@link ContentHandler#startDocument} and {@link
   * ContentHandler#endDocument} callbacks on the {@link #setContentHandler ContentHandler}. The
   * user shall invoke these methods directly prior/after outputting the document fragments.
   *
   * @param nodes <code>List</code> of JDOM nodes to output.
   * @throws JDOMException if any error occurred.
   * @see #outputFragment(org.jdom2.Content)
   */
  public void outputFragment(List<? extends Content> nodes) throws JDOMException {
    // Guard for a null or empty node list.
    // NOTE(review): the guard body (presumably an early return) is missing from this excerpt.
    if ((nodes == null) || (nodes.size() == 0)) {

    // Output node list as a document fragment.
    // A fresh NamespaceStack is used for each call.
    elementContent(nodes, new NamespaceStack());
Beispiel #5
  /**
   * This will invoke the <code>ContentHandler.startPrefixMapping</code> callback when a new
   * namespace is encountered in the <code>Document</code>.
   *
   * @param element <code>Element</code> used in callbacks.
   * @param namespaces <code>NamespaceStack</code> of Namespaces in scope.
   * @return <code>Attributes</code> declaring the namespaces local to <code>element</code> or
   *     <code>null</code>.
   */
  private Attributes startPrefixMapping(Element element, NamespaceStack namespaces)
      throws JDOMException {
    AttributesImpl nsAtts = null; // The namespaces as xmlns attributes

    // The element's own namespace comes first; it is skipped when it is the
    // Namespace.XML_NAMESPACE instance (reference comparison).
    Namespace ns = element.getNamespace();
    if (ns != Namespace.XML_NAMESPACE) {
      String prefix = ns.getPrefix();
      String uri = namespaces.getURI(prefix);
      // Declare the prefix only when the stack does not already map it to the same URI
      // (uri may be null here; equals(null) is simply false).
      if (!ns.getURI().equals(uri)) {
        nsAtts = this.addNsAttribute(nsAtts, ns);
        try {
          contentHandler.startPrefixMapping(prefix, ns.getURI());
        } catch (SAXException se) {
          throw new JDOMException("Exception in startPrefixMapping", se);

    // Fire additional namespace declarations
    List additionalNamespaces = element.getAdditionalNamespaces();
    if (additionalNamespaces != null) {
      Iterator itr = additionalNamespaces.iterator();
      while (itr.hasNext()) {
        // NOTE(review): the cast has no operand (presumably itr.next()); excerpt is truncated.
        ns = (Namespace);
        String prefix = ns.getPrefix();
        String uri = namespaces.getURI(prefix);
        // Same "not already mapped to this URI" test as for the element's own namespace.
        if (!ns.getURI().equals(uri)) {
          nsAtts = this.addNsAttribute(nsAtts, ns);
          try {
            contentHandler.startPrefixMapping(prefix, ns.getURI());
          } catch (SAXException se) {
            throw new JDOMException("Exception in startPrefixMapping", se);
    // Still null when no namespace had to be declared.
    return nsAtts;
Beispiel #6
  /**
   * This will invoke the callbacks for the content of an element.
   *
   * @param content element content as a <code>List</code> of nodes.
   * @param namespaces <code>NamespaceStack</code> of Namespaces in scope.
   */
  private void elementContent(List content, NamespaceStack namespaces) throws JDOMException {
    for (Iterator i = content.iterator(); i.hasNext(); ) {
      // NOTE(review): the right-hand side is missing (presumably i.next()); excerpt is truncated.
      Object obj =;

      // Content items are dispatched to the single-node overload; anything else is reported
      // through handleError.
      if (obj instanceof Content) {
        this.elementContent((Content) obj, namespaces);
      } else {
        // Not a valid element child. This could happen with
        // application-provided lists which may contain non
        // JDOM objects.
        handleError(new JDOMException("Invalid element content: " + obj));
Beispiel #7
    /**
     * Returns the content of a JDOM Element detached from it.
     *
     * @param elt the element to get the content from.
     * @return a (possibly empty) list of JDOM nodes, detached from their parent.
     */
    private List getDetachedContent(Element elt) {
      List content = elt.getContent();
      // Presized to the current number of children.
      List nodes = new ArrayList(content.size());

      // Repeatedly remove the first child until the element's content list is empty.
      // NOTE(review): the statement that uses o (presumably nodes.add(o)) is missing from this
      // excerpt.
      while (content.size() != 0) {
        Object o = content.remove(0);
      return (nodes);
Beispiel #8
  /**
   * This will output a list of JDOM nodes as a document, firing off the SAX events that have been
   * registered.
   *
   * <p><strong>Warning</strong>: This method may output ill-formed XML documents if the list
   * contains top-level objects that are not legal at the document level (e.g. Text or CDATA nodes,
   * multiple Element nodes, etc.). Thus, it should only be used to output document portions towards
   * ContentHandlers capable of accepting such ill-formed documents (such as XSLT processors).
   *
   * @param nodes <code>List</code> of JDOM nodes to output.
   * @throws JDOMException if any error occurred.
   * @see #output(org.jdom2.Document)
   */
  public void output(List<? extends Content> nodes) throws JDOMException {
    // Guard for a null or empty node list.
    // NOTE(review): the guard body (presumably an early return) is missing from this excerpt.
    if ((nodes == null) || (nodes.size() == 0)) {

    // NOTE(review): the three "contentHandler.*()" comments below mark steps whose actual
    // statements are not present in this excerpt.
    // contentHandler.setDocumentLocator()

    // contentHandler.startDocument()

    // Process node list.
    elementContent(nodes, new NamespaceStack());

    // contentHandler.endDocument()
Beispiel #9
  /**
   * Returns the result of an XSL Transformation as a list of JDOM nodes.
   *
   * <p>If the result of the transformation is a JDOM document, this method converts it into a list
   * of JDOM nodes; any subsequent call to {@link #getDocument} will return <code>null</code>.
   *
   * @return the transformation result as a (possibly empty) list of JDOM nodes (Elements, Texts,
   *     Comments, PIs...).
   */
  public List getResult() {
    // Default answer when result is neither a List nor an unqueried Document.
    List nodes = Collections.EMPTY_LIST;

    // Retrieve result from the document builder if not set.
    // NOTE(review): the statement this comment refers to is missing from the excerpt.

    if (result instanceof List) {
      nodes = (List) result;
    } else {
      // A Document result is converted only while queried is still false.
      if ((result instanceof Document) && (queried == false)) {
        List content = ((Document) result).getContent();
        nodes = new ArrayList(content.size());

        // Repeatedly remove the first child until the document content is empty.
        // NOTE(review): the statement that uses o (presumably nodes.add(o)) is missing here.
        while (content.size() != 0) {
          Object o = content.remove(0);
        // The node list replaces the Document as the stored result.
        result = nodes;
    queried = true;

    return (nodes);
Beispiel #10
  /**
   * This reports the occurrence of an actual element. It will include the element's attributes,
   * with the exception of XML vocabulary specific attributes, such as <code>
   * xmlns:[namespace prefix]</code> and <code>xsi:schemaLocation</code>.
   *
   * @param namespaceURI <code>String</code> namespace URI this element is associated with, or an
   *     empty <code>String</code>
   * @param localName <code>String</code> name of element (with no namespace prefix, if one is
   *     present)
   * @param qName <code>String</code> XML 1.0 version of element name: [namespace
   *     prefix]:[localName]
   * @param atts <code>Attributes</code> list for this element
   * @throws SAXException when things go wrong
   */
  public void startElement(String namespaceURI, String localName, String qName, Attributes atts)
      throws SAXException {
    // Nothing is built while the suppress flag is set.
    if (suppress) return;

    Element element = null;

    // A non-null, non-empty namespace URI yields a namespaced element; otherwise the element is
    // created from its local name only.
    if ((namespaceURI != null) && (!namespaceURI.equals(""))) {
      String prefix = "";

      // Determine any prefix on the Element
      if (!qName.equals(localName)) {
        // NOTE(review): if qName differs from localName but has no ':', split is -1 and
        // substring(0, -1) throws StringIndexOutOfBoundsException.
        int split = qName.indexOf(":");
        prefix = qName.substring(0, split);
      Namespace elementNamespace = Namespace.getNamespace(prefix, namespaceURI);
      element = factory.element(localName, elementNamespace);
    } else {
      element = factory.element(localName);

    // Take leftover declared namespaces and add them to this element's
    // map of namespaces
    // NOTE(review): the body of this if is missing from the excerpt.
    if (declaredNamespaces.size() > 0) {

    // Handle attributes
    for (int i = 0, len = atts.getLength(); i < len; i++) {
      Attribute attribute = null;

      String attLocalName = atts.getLocalName(i);
      String attQName = atts.getQName(i);
      // The SAX type string is mapped to an int code by getAttributeType.
      int attType = getAttributeType(atts.getType(i));

      // Bypass any xmlns attributes which might appear, as we got
      // them already in startPrefixMapping().
      // This is sometimes necessary when SAXHandler is used with
      // another source than SAXBuilder, as with JDOMResult.
      // NOTE(review): the body of this if (presumably a continue) is missing from the excerpt.
      if (attQName.startsWith("xmlns:") || attQName.equals("xmlns")) {

      // First clause per
      // patch from Mattias Jiderhamn
      // Three cases: empty local name with an unprefixed qName, prefixed qName (namespaced
      // attribute), and plain local name.
      if ("".equals(attLocalName) && attQName.indexOf(":") == -1) {
        attribute = factory.attribute(attQName, atts.getValue(i), attType);
      } else if (!attQName.equals(attLocalName)) {
        String attPrefix = attQName.substring(0, attQName.indexOf(":"));
        Namespace attNs = Namespace.getNamespace(attPrefix, atts.getURI(i));

        attribute = factory.attribute(attLocalName, atts.getValue(i), attType, attNs);
      } else {
        attribute = factory.attribute(attLocalName, atts.getValue(i), attType);
      factory.setAttribute(element, attribute);


    // The first element seen becomes the document root; later ones are appended to the element
    // returned by getCurrentElement().
    if (atRoot) {
      document.setRootElement(element); // XXX should we use a factory call?
      atRoot = false;
    } else {
      factory.addContent(getCurrentElement(), element);
    // The new element becomes the current one for subsequent callbacks.
    currentElement = element;
Beispiel #11
  /**
   * SAX start-of-element callback of a patent XML handler.
   *
   * <p>Visible behaviour, keyed on {@code qName}:
   *
   * <ul>
   *   <li>{@code patent-document} / {@code fulltext-document}: resets the reference counters,
   *       reads the {@code doc-number}, {@code kind} and {@code date} attributes and allocates
   *       fresh {@code CitedPatentNumber}, {@code accumulatedText} and {@code allContent}.
   *   <li>{@code ref} / {@code bibl}: depending on the {@code type}/{@code typ} attribute value,
   *       tokenizes the text returned by {@code getText()}, appends each token to
   *       {@code accumulatedText} with an {@code <other>} or {@code <ignore>} label, and sets the
   *       {@code npl} / {@code ref} flags.
   *   <li>{@code patcit}: stores the {@code ucid} attribute in {@code cited_number}.
   * </ul>
   *
   * <p>NOTE(review): this excerpt is truncated; closing braces and some statements (loop
   * counters, {@code continue}s) are missing, so the description covers only what is visible.
   *
   * @param namespaceURI namespace URI of the element (not used in the visible code)
   * @param localName local name of the element (not used in the visible code)
   * @param qName qualified element name used for dispatch
   * @param atts attributes attached to the element
   * @throws SAXException declared by the SAX callback contract
   */
  public void startElement(String namespaceURI, String localName, String qName, Attributes atts)
      throws SAXException {
    if (qName.equals("patent-document") || qName.equals("fulltext-document")) {
      // New document: reset the per-document reference counters.
      nbNPLRef = 0;
      nbPatentRef = 0;
      nbAllRef = 0;
      int length = atts.getLength();

      // Process each attribute
      for (int i = 0; i < length; i++) {
        // Get names and values for each attribute
        String name = atts.getQName(i);
        String value = atts.getValue(i);

        if (name != null) {
          if (name.equals("lang")) {
            // Global_Language_Code = value.toLowerCase();
          if (name.equals("doc-number")) {
            // The document number is always prefixed with "EP".
            PatentNumber = "EP" + value;
          if (name.equals("kind")) {
            CodeType = value;
          if (name.equals("date")) {
            PublicDate = value;

      CitedPatentNumber = new ArrayList<String>();
      accumulatedText = new StringBuffer();
      allContent = new StringBuffer();
    } else if (qName.equals("description")) {
    } else if (qName.equals("ref") || qName.equals("bibl")) {
      int length = atts.getLength();
      // Process each attribute
      for (int i = 0; i < length; i++) {
        // Get names and values for each attribute
        String name = atts.getQName(i);
        String value = atts.getValue(i);

        if (name != null) {
          if (name.equals("type") || name.equals("typ")) {
            // Non-patent-literature reference types.
            if (value.equals("npl") || value.equals("book") || value.equals("journal")) {
              String content = getText();

              // we output what has been read so far in the description
              // we tokenize the text
              // ArrayList<String> tokens =
              // StringTokenizer st = new StringTokenizer(content, delimiters, true);
              List<String> tokenizations = new ArrayList<String>();
              try {
                // TBD: pass a language object to the tokenize method call
                tokenizations = analyzer.tokenize(content);
              } catch (Exception e) {
                // Best effort: a tokenizer failure is only logged and the token list stays empty.
                LOGGER.debug("Tokenization for XML patent document has failed.");

              // int nbTokens = st.countTokens();
              int nbTokens = tokenizations.size();
              // NOTE(review): j is never incremented in the visible code; the increment is
              // presumably among the lines missing from this excerpt.
              int j = 0;
              // while (st.hasMoreTokens()) {
              for (String token : tokenizations) {
                // String token = st.nextToken().trim();
                // Whitespace-only tokens; the body of this if is missing from the excerpt.
                if ((token.trim().length() == 0)
                    || (token.equals(" "))
                    || (token.equals("\t"))
                    || (token.equals("\n"))
                    || (token.equals("\r"))) {

                // Tokens within a window of N at the end of the text, or at its start right
                // after a reference, are labelled <other>; all others <ignore>. N == -1 turns
                // the <other> labelling off here.
                if ((j > (nbTokens - N) && (N != -1)) || (refFound && (j < N) && (N != -1))) {
                  try {
                    accumulatedText.append(token + "\t" + "<other>\n");
                    allContent.append(token + " ");
                  } catch (Exception e) {
                    //										e.printStackTrace();
                    throw new GrobidException("An exception occured while running Grobid.", e);
                } else {
                  try {
                    accumulatedText.append(token + "\t" + "<ignore>\n");
                    allContent.append(token + " ");
                  } catch (Exception e) {
                    //										e.printStackTrace();
                    throw new GrobidException("An exception occured while running Grobid.", e);


              npl = true;
              ref = true;
            } else if (value.equals("patent") || value.equals("pl")) {
              String content = getText();

              // we output what has been read so far in the description
              // we tokenize the text
              // ArrayList<String> tokens =
              //	TextUtilities.segment(content,"[("+TextUtilities.punctuations);
              // StringTokenizer st = new StringTokenizer(content, delimiters, true);
              List<String> tokenizations = new ArrayList<String>();
              try {
                // TBD: pass a language object to the tokenize method call
                tokenizations = analyzer.tokenize(content);
              } catch (Exception e) {
                // Best effort: a tokenizer failure is only logged and the token list stays empty.
                LOGGER.debug("Tokenization for XML patent document has failed.");

              // int nbTokens = st.countTokens();
              int nbTokens = tokenizations.size();
              int j = 0;
              for (String token : tokenizations) {
                // while (st.hasMoreTokens()) {
                // String token = st.nextToken().trim();
                // Whitespace-only tokens; the body of this if is missing from the excerpt.
                if ((token.trim().length() == 0)
                    || (token.equals(" "))
                    || (token.equals("\t"))
                    || (token.equals("\n"))
                    || (token.equals("\r"))) {

                // NOTE(review): unlike the npl branch above, this condition has no N != -1
                // check and uses the non-short-circuit | and & operators; confirm whether the
                // difference is intended.
                if ((j > (nbTokens - N)) | (refFound & (j < N))) {
                  try {
                    accumulatedText.append(token + "\t" + "<other>\n");
                    allContent.append(token + " ");
                  } catch (Exception e) {
                    //										e.printStackTrace();
                    throw new GrobidException("An exception occured while running Grobid.", e);
                } else {
                  try {
                    accumulatedText.append(token + "\t" + "<ignore>\n");
                    allContent.append(token + " ");
                  } catch (Exception e) {
                    //										e.printStackTrace();
                    throw new GrobidException("An exception occured while running Grobid.", e);

              npl = false;
              ref = true;
            } else {
              // Unknown type value: warn on stdout and clear both flags.
              System.out.println("Warning: unknown attribute value for ref or bibl: " + value);
              ref = false;
              npl = false;

    } else if (qName.equals("claim")) {
    } else if (qName.equals("invention-title")) {
    } else if (qName.equals("patcit")) {
      int length = atts.getLength();

      // Process each attribute
      for (int i = 0; i < length; i++) {
        // Get names and values for each attribute
        String name = atts.getQName(i);
        String value = atts.getValue(i);

        if (name != null) {
          if (name.equals("ucid")) {
            cited_number = value;
            // this patent number normally needs some normalization
Beispiel #12
  /**
   * SAX end-of-element callback of a patent XML handler.
   *
   * <p>Visible behaviour, keyed on {@code qName}:
   *
   * <ul>
   *   <li>{@code ref} / {@code bibl}: normalises the reference text, updates the reference
   *       stores and counters, then appends the tokens of the reference to
   *       {@code accumulatedText} with a label; finally clears {@code ref}.
   *   <li>{@code description}: when a reference was found, appends the tokens of the pending
   *       text labelled {@code <other>} or {@code <ignore>} and clears {@code refFound}.
   *   <li>{@code patent-document} / {@code fulltext-document}: runs the lexicon lookups on the
   *       accumulated content and releases {@code allContent}.
   *   <li>{@code heading} / {@code row}: appends a space to {@code accumulator}.
   * </ul>
   *
   * <p>NOTE(review): this excerpt is truncated; closing braces and several statements are
   * missing, so the description covers only what is visible.
   *
   * @param uri namespace URI of the element (not used in the visible code)
   * @param localName local name of the element (not used in the visible code)
   * @param qName qualified element name used for dispatch
   * @throws SAXException declared by the SAX callback contract
   */
  public void endElement(java.lang.String uri, java.lang.String localName, java.lang.String qName)
      throws SAXException {
    if (qName.equals("date")) {
    } else if (qName.equals("ref") || qName.equals("bibl")) {
      String refString = getRefText();
      // Flatten newlines and tabs to spaces; the double-space replacement is a single pass.
      refString = refString.replace("\n", " ");
      refString = refString.replace("\t", " ");
      refString = refString.replace("  ", " ");

      if (npl && ref) {
        if (referencesNPL == null) referencesNPL = new ArrayList<String>();
        refFound = true;
        if (nplReferences) nbNPLRef++;
      } else if (ref) {
        if (referencesPatent == null) {
          referencesPatent = new HashMap<String, ArrayList<String>>();
        // Patent references are grouped per file name.
        ArrayList<String> refss = referencesPatent.get(currentFileName);

        if (refss == null) {
          refss = new ArrayList<String>();

        referencesPatent.put(currentFileName, refss);
        refFound = true;
        // NOTE(review): the body of this if is missing from the excerpt.
        if (patentReferences) {

      if (refFound) {
        // we tokenize the text
        // ArrayList<String> tokens = TextUtilities.segment(refString,
        // "[("+TextUtilities.punctuations);
        // StringTokenizer st = new StringTokenizer(refString, delimiters, true);
        List<String> tokenizations = new ArrayList<String>();
        try {
          // TBD: pass a language object to the tokenize method call
          tokenizations = analyzer.tokenize(refString);
        } catch (Exception e) {
          // Best effort: a tokenizer failure is only logged and the token list stays empty.
          LOGGER.debug("Tokenization for XML patent document has failed.");

        // NOTE(review): i is never incremented in the visible code; the increment is presumably
        // among the lines missing from this excerpt.
        int i = 0;
        // String token = null;
        // for(String token : tokens) {
        // while (st.hasMoreTokens()) {
        for (String token : tokenizations) {
          // token = st.nextToken().trim();
          // Whitespace-only tokens; the body of this if is missing from the excerpt.
          if ((token.trim().length() == 0)
              || (token.equals(" "))
              || (token.equals("\t"))
              || (token.equals("\n"))
              || (token.equals("\r"))) {
          try {
            accumulatedText.append(token + "\t");
            allContent.append(token + " ");
            if (npl) {
              if (nplReferences) {
                if (i == 0) {
                  // accumulatedText.append("refNPLBegin\n");
                // NOTE(review): token.trim() was already called above, so token cannot be null
                // here; this branch (and the matching one below) looks unreachable.
                } else if (token == null) {
                  // accumulatedText.append("refNPLEnd\n");
                } else {
              } else accumulatedText.append("<other>\n");
            } else {
              if (patentReferences) {
                // First token of a patent reference gets the I- prefix.
                if (i == 0) accumulatedText.append("I-<refPatent>\n");
                else if (token == null) accumulatedText.append("E-<refPatent>\n");
                else accumulatedText.append("<refPatent>\n");
              } else accumulatedText.append("<other>\n");
          } catch (Exception e) {
            //						e.printStackTrace();
            throw new GrobidException("An exception occured while running Grobid.", e);
      ref = false;
    } else if (qName.equals("classification-ipcr")) {
    } else if (qName.equals("classification-symbol")) {
    } else if (qName.equals("abstract")) {
    } else if (qName.equals("heading")) {
      accumulator.append(" ");
    } else if (qName.equals("description")) {
      if (refFound) {
        String content = getText();

        // we tokenize the text
        // ArrayList<String> tokens = TextUtilities.segment(content,
        // "[("+TextUtilities.punctuations);
        // StringTokenizer st = new StringTokenizer(content, delimiters, true);
        List<String> tokenizations = new ArrayList<String>();
        try {
          // TBD: pass a language object to the tokenize method call
          tokenizations = analyzer.tokenize(content);
        } catch (Exception e) {
          // Best effort: a tokenizer failure is only logged and the token list stays empty.
          LOGGER.debug("Tokenization for XML patent document has failed.");

        int i = 0;
        // String token = null;
        // for(String token : tokens) {
        // while (st.hasMoreTokens()) {
        for (String token : tokenizations) {
          // token = st.nextToken().trim();
          // Whitespace-only tokens; the body of this if is missing from the excerpt.
          if ((token.trim().length() == 0)
              || (token.equals(" "))
              || (token.equals("\t"))
              || (token.equals("\n"))
              || (token.equals("\r"))) {
          // we print only a window of N words
          // Beyond the window tokens are labelled <ignore>, inside it <other>.
          if ((i > N) && (N != -1)) {
            // break;
            token = token.trim();
            if (token.length() > 0) {
              accumulatedText.append(token + "\t" + "<ignore>\n");
              allContent.append(token + " ");
          } else {
            try {
              token = token.trim();
              if (token.length() > 0) {
                accumulatedText.append(token + "\t" + "<other>\n");
                allContent.append(token + " ");
            } catch (Exception e) {
              throw new GrobidException("An exception occured while running Grobid.", e);

        refFound = false;
    } else if (qName.equals("patcit")) {
      // we register the citation, the citation context will be marked in a later stage
      if (citations == null) citations = new ArrayList<String>();
    } else if (qName.equals("invention-title")) {
    } else if (qName.equals("applicants")) {
    } else if (qName.equals("inventors")) {
    } else if (qName.equals("document-id")) {
    } else if (qName.equals("legal-status")) {
    } else if (qName.equals("bibliographic-data")) {
    } else if (qName.equals("doc-number")) {
    } else if (qName.equals("country")) {
    } else if (qName.equals("kind")) {
    // NOTE(review): "classification-symbol" is already tested earlier in this chain, so this
    // second test can never match.
    } else if (qName.equals("classification-symbol")) {
    } else if (qName.equals("classification-ecla")) {
    } else if (qName.equals("patent-document") || qName.equals("fulltext-document")) {
      // End of document: run the lexicon lookups over everything accumulated in allContent.
      String allString = allContent.toString();
      journalsPositions = lexicon.inJournalNames(allString);
      abbrevJournalsPositions = lexicon.inAbbrevJournalNames(allString);
      conferencesPositions = lexicon.inConferenceNames(allString);
      publishersPositions = lexicon.inPublisherNames(allString);
      // Drop the references to the accumulated content.
      allContent = null;
      allString = null;
    } else if (qName.equals("row")) {
      accumulator.append(" ");
    } else if (qName.equals("p")) {
  /**
   * Returns the text currently held in {@code accumulator}.
   *
   * <p>Blank text yields the empty string. When {@code counting} is set, the visible code wraps
   * occurrences of patent contexts and of article raw bibliographic strings in
   * {@code <ref type="patent">} / {@code <ref type="npl">} tags via {@code String.replace}, and
   * advances {@code offset} by the length of the original text.
   *
   * <p>NOTE(review): this excerpt is truncated; closing braces, loop increments and at least one
   * comment terminator are missing, so the description covers only what is visible.
   *
   * @return the accumulated text, possibly with reference tags inserted
   */
  public String getText() {
    String text = accumulator.toString();
    if (text.trim().length() == 0) {
      return "";
    /*text = text.replace("\n", " ");
    text = text.replace("  ", " ");*/
    if (counting) {

      // NOTE(review): count is declared again a few lines below; this tokenizer fragment was
      // presumably commented out in the original and lost its comment delimiters.
      StringTokenizer st = new StringTokenizer(text, delimiters, true);
      int count = 0;

      while(st.hasMoreTokens()) {
      	String token = st.nextToken().trim();
      	if (token.length() == 0) {

      // Patent scan resumes from the last matched patent index.
      int i = currentPatentIndex;
      int count = text.length();

      while (i < patents.size()) {
        PatentItem currentPatent = patents.get(i);
        if (currentPatent != null) {
          int startOffset = currentPatent.getOffsetBegin();
          int endOffset = currentPatent.getOffsetEnd();

          // Only patents lying entirely inside the window [offset, offset + count] are tagged.
          if ((startOffset >= offset) && (endOffset <= offset + count)) {
            String context = currentPatent.getContext();

            /*System.out.println("OFFSET: " + offset);
            System.out.println("count: " + count);
            System.out.println("startOffset: " + startOffset);
            System.out.println("endOffset: " + endOffset);
            System.out.println("context: " + context);
            System.out.println("text: " + text);*/

            // A leading space of the context is kept outside the <ref> tag.
            String target = "";
            if (context.charAt(0) == ' ') {
              target = " <ref type=\"patent\">" + context.substring(1, context.length()) + "</ref>";
            } else {
              target = "<ref type=\"patent\">" + context + "</ref>";

            // String.replace substitutes every occurrence of context in text.
            text = text.replace(context, target);
            currentPatentIndex = i;


      // i = currentArticleIndex;
      // Articles are always scanned from the first entry.
      i = 0;
      while (i < articles.size()) {
        BibDataSet currentArticle = articles.get(i);
        if (currentArticle != null) {
          List<Integer> offsets = currentArticle.getOffsets();
          int startOffset = -1;
          int endOffset = -1;
          String context = currentArticle.getRawBib().trim();
          if (offsets.size() > 0) {
            if (offsets.get(0) != null) {
              startOffset = offsets.get(0).intValue();
              // NOTE(review): the block comment opened on the next line has no terminator in
              // this excerpt; as written it runs on until the next "*/" further down.
              /*StringTokenizer stt = new StringTokenizer(context, delimiters, true);
              int count2 = 0;
              while(stt.hasMoreTokens()) {
              	String token2 = stt.nextToken().trim();
              	if (token2.length() == 0) {
              // endOffset = offsets.get(1).intValue();
              endOffset = startOffset + context.length();

          // if ( (startOffset >= offset) && (endOffset <= offset+count) ) {
          if ((startOffset >= offset)) {
            /*System.out.println("OFFSET: " + offset);
            System.out.println("count: " + count);
            System.out.println("startOffset: " + startOffset);
            System.out.println("endOffset: " + endOffset);
            System.out.println("context: " + context);
            System.out.println("text: " + text);*/

            String target = " <ref type=\"npl\">" + context + "</ref> ";
            text = text.replace(context, target);
            currentArticleIndex = i;


      // Advance the running offset by the length of the text as it was before tagging.
      offset += count;

    return text;