Пример #1
0
  /**
   * Renders a SAX element as a single-line string suitable for debug output.
   *
   * <p>The result has the form {@code <qName attr="value" ...>[...]</qName>}. Any text that
   * {@code mapPrefixIfNeeded} contributes for the element and for each attribute is appended to
   * the start tag in the order the calls are made.
   *
   * @param uri namespace URI of the element
   * @param qName qualified name of the element
   * @param attributes attributes of the element
   * @return the debug representation of the element
   */
  public static String saxElementToDebugString(String uri, String qName, Attributes attributes) {
    final StringBuilder out = new StringBuilder("<").append(qName);

    // Shared across the element and all of its attributes.
    final Set<String> seenPrefixes = new HashSet<String>();
    mapPrefixIfNeeded(seenPrefixes, uri, qName, out);

    for (int idx = 0; idx < attributes.getLength(); idx++) {
      final String attributeQName = attributes.getQName(idx);
      mapPrefixIfNeeded(seenPrefixes, attributes.getURI(idx), attributeQName, out);

      out.append(' ')
          .append(attributeQName)
          .append("=\"")
          .append(attributes.getValue(idx))
          .append('\"');
    }

    // End of start tag, content placeholder, then the end tag.
    out.append('>').append("[...]").append("</").append(qName).append('>');

    return out.toString();
  }
Пример #2
0
 /**
  * Copies the attributes whose namespace URI is the empty string into a new list.
  *
  * @param attributes source attributes
  * @return a new {@code AttributesImpl} holding, in their original order, every source attribute
  *     whose namespace URI equals {@code ""}
  */
 public static AttributesImpl getAttributesFromDefaultNamespace(final Attributes attributes) {
   final AttributesImpl filtered = new AttributesImpl();
   for (int idx = 0, count = attributes.getLength(); idx < count; idx++) {
     final String uri = attributes.getURI(idx);
     if ("".equals(uri)) {
       filtered.addAttribute(
           uri,
           attributes.getLocalName(idx),
           attributes.getQName(idx),
           attributes.getType(idx),
           attributes.getValue(idx));
     }
   }
   return filtered;
 }
 /**
  * Records attributes that belong to a foreign namespace into {@code extensionAttributes}.
  *
  * <p>Attributes with an empty namespace URI, or whose URI starts with the OSGi metatype
  * namespace prefix, are skipped. Every other attribute is stored under its namespace URI, keyed
  * by the name that {@code getName(localName, qName)} returns.
  *
  * @param attributes attributes of the element being processed
  */
 protected void collectExtensionAttributes(Attributes attributes) {
   for (int idx = 0; idx < attributes.getLength(); idx++) {
     final String namespace = attributes.getURI(idx);
     final boolean isExtension =
         namespace.length() != 0
             && !namespace.startsWith("http://www.osgi.org/xmlns/metatype/v"); // $NON-NLS-1$
     if (isExtension) {
       // Lazily create the per-namespace bucket on first use.
       Map<String, String> bucket = extensionAttributes.get(namespace);
       if (bucket == null) {
         bucket = new HashMap<String, String>();
         extensionAttributes.put(namespace, bucket);
       }
       final String attributeName =
           getName(attributes.getLocalName(idx), attributes.getQName(idx));
       bucket.put(attributeName, attributes.getValue(idx));
     }
   }
 }
Пример #4
0
 /**
  * Converts the XML attributes of a tag into a property-name to converted-value map.
  *
  * <p>The attribute mappings registered for {@code tagName} in {@code attributeMaps} drive the
  * conversion. After all attributes present on the element have been converted, required
  * attributes are checked and default values are added by the corresponding helper methods.
  *
  * @param tagName name of the tag whose attributes are being mapped
  * @param atts attributes found on the tag
  * @return the resulting map, or {@code null} if either argument is {@code null} or no mapping
  *     is registered for {@code tagName}
  * @throws ParserException if a registered mapping has an unexpected type, an attribute has no
  *     mapping, or a value cannot be converted
  */
 private Map attributeMap(String tagName, Attributes atts) throws ParserException {
   if (tagName == null || atts == null) {
     return null;
   }

   Map tagMapping;
   try {
     tagMapping = (Map) attributeMaps.get(tagName);
   } catch (Exception e) {
     throw new ParserException(
         "Typecast error, unknown element found in attribute list mappings! " + e.getMessage());
   }
   if (tagMapping == null) {
     return null;
   }

   final Map converted = new HashMap();
   for (int idx = 0; idx < atts.getLength(); idx++) {
     final String attributeName = atts.getQName(idx);
     final String rawValue = atts.getValue(idx);

     TagMap.AttributeMapping attributeMapping;
     try {
       attributeMapping = (TagMap.AttributeMapping) tagMapping.get(attributeName);
     } catch (Exception e) {
       throw new ParserException(
           "Typecast error, unknown element found in property mapping! " + e.getMessage());
     }
     if (attributeMapping == null) {
       throw new ParserException(
           "No attribute mapping specified for attribute: "
               + attributeName
               + " in tag: "
               + tagName);
     }

     // Resolved outside the try block so its failures are not reported as conversion errors.
     final String property = attributeMapping.getPropertyName();
     try {
       converted.put(property, attributeMapping.convertValue(rawValue));
     } catch (Exception e) {
       throw new ParserException(
           "Can not convert given value: \""
               + rawValue
               + "\" to specified type: "
               + attributeMapping.getType()
               + " for attribute: "
               + attributeName
               + " in tag: "
               + tagName
               + "! "
               + e.getMessage());
     }
   }

   checkForRequiredAttributes(tagName, converted, tagMapping);
   addDefaultValues(converted, tagMapping);
   return converted;
 }
Пример #5
0
  /**
   * Returns a copy of the given attributes without the attribute identified by namespace URI and
   * local name.
   *
   * @param attributes source attributes; not modified
   * @param uri namespace URI of the attribute to drop
   * @param localname local name of the attribute to drop
   * @return a new {@code AttributesImpl} with every non-matching attribute, in original order
   */
  public static AttributesImpl removeAttribute(
      Attributes attributes, String uri, String localname) {
    final AttributesImpl kept = new AttributesImpl();
    for (int idx = 0; idx < attributes.getLength(); idx++) {
      final String currentUri = attributes.getURI(idx);
      final String currentLocalname = attributes.getLocalName(idx);

      final boolean isTarget = uri.equals(currentUri) && localname.equals(currentLocalname);
      if (isTarget) {
        continue;
      }

      kept.addAttribute(
          currentUri,
          currentLocalname,
          attributes.getQName(idx),
          attributes.getType(idx),
          attributes.getValue(idx));
    }
    return kept;
  }
  /**
   * Echoes the start tag of an element to {@code writer} unless counting mode is active, then
   * updates the counting state from the element name.
   *
   * <p>When not counting, the text accumulated so far is written first and the accumulator is
   * cleared; the start tag follows with every attribute that has both a name and a value. A
   * {@code description} element switches counting on and resets {@code offset}; a {@code
   * patent-document} element switches counting off.
   *
   * @param namespaceURI namespace URI of the element (unused)
   * @param localName local name of the element (unused)
   * @param qName qualified name of the element
   * @param atts attributes of the element
   * @throws SAXException declared by the SAX contract
   * @throws GrobidException wrapping any exception raised while processing
   */
  public void startElement(String namespaceURI, String localName, String qName, Attributes atts)
      throws SAXException {
    try {
      if (!counting) {
        // Flush the text gathered before this element.
        writer.write(getText());
        accumulator.setLength(0);

        // Re-emit the start tag with its attributes.
        writer.write("<" + qName);
        final int attributeCount = atts.getLength();
        for (int idx = 0; idx < attributeCount; idx++) {
          final String attributeName = atts.getQName(idx);
          final String attributeValue = atts.getValue(idx);
          if ((attributeName == null) || (attributeValue == null)) {
            continue;
          }
          writer.write(" " + attributeName + "=\"" + attributeValue + "\"");
        }
        writer.write(">");
      }

      if (qName.equals("description")) {
        offset = 0;
        counting = true;
      } else if (qName.equals("patent-document")) {
        counting = false;
      }
    } catch (Exception e) {
      throw new GrobidException("An exception occured while running Grobid.", e);
    }
  }
Пример #7
0
  /**
   * Returns a copy of the given attributes in which the attribute identified by namespace URI
   * and local name carries the given value.
   *
   * <p>An existing attribute with the same URI and local name is replaced in place, keeping its
   * position; otherwise the attribute is appended at the end. The written attribute always uses
   * the qualified name built from {@code prefix} and {@code localname} and the CDATA type.
   *
   * @param attributes source attributes; not modified
   * @param uri namespace URI of the attribute to set
   * @param prefix prefix used to build the qualified name of the attribute
   * @param localname local name of the attribute to set
   * @param value value of the attribute
   * @return a new {@code AttributesImpl} containing the added or replaced attribute
   */
  public static AttributesImpl addOrReplaceAttribute(
      Attributes attributes, String uri, String prefix, String localname, String value) {
    final AttributesImpl result = new AttributesImpl();
    boolean found = false;

    for (int idx = 0; idx < attributes.getLength(); idx++) {
      final String currentUri = attributes.getURI(idx);
      final String currentValue = attributes.getValue(idx);
      final String currentType = attributes.getType(idx);
      final String currentQName = attributes.getQName(idx);
      final String currentLocalname = attributes.getLocalName(idx);

      final boolean isTarget = uri.equals(currentUri) && localname.equals(currentLocalname);
      if (!isTarget) {
        // Unrelated attribute: copy through unchanged.
        result.addAttribute(
            currentUri, currentLocalname, currentQName, currentType, currentValue);
        continue;
      }

      // Existing attribute: substitute the new definition at the same position.
      found = true;
      result.addAttribute(
          uri,
          localname,
          XMLUtils.buildQName(prefix, localname),
          ContentHandlerHelper.CDATA,
          value);
    }

    if (found) {
      return result;
    }

    // No existing attribute matched, so append the new one.
    result.addAttribute(
        uri, localname, XMLUtils.buildQName(prefix, localname), ContentHandlerHelper.CDATA, value);
    return result;
  }
Пример #8
0
  /**
   * This reports the occurrence of an actual element. It will include the element's attributes,
   * with the exception of XML vocabulary specific attributes, such as <code>
   * xmlns:[namespace prefix]</code> and <code>xsi:schemaLocation</code>.
   *
   * <p>Nothing happens while {@code suppress} is set. Otherwise a new element is created through
   * {@code factory}, pending declared namespaces are transferred to it, its attributes are
   * added, buffered characters are flushed, and the element is attached either as the document
   * root or as a child of the current element before becoming the current element itself.
   *
   * @param namespaceURI <code>String</code> namespace URI this element is associated with, or an
   *     empty <code>String</code>
   * @param localName <code>String</code> name of element (with no namespace prefix, if one is
   *     present)
   * @param qName <code>String</code> XML 1.0 version of element name: [namespace
   *     prefix]:[localName]
   * @param atts <code>Attributes</code> list for this element
   * @throws SAXException when things go wrong
   */
  public void startElement(String namespaceURI, String localName, String qName, Attributes atts)
      throws SAXException {
    if (suppress) {
      return;
    }

    Element element = null;

    if ((namespaceURI != null) && (!namespaceURI.equals(""))) {
      String prefix = "";

      // Determine any prefix on the Element. A qName that differs from localName does not
      // necessarily contain a colon (SAX permits an empty qName), so only cut at a colon that
      // is actually present; otherwise substring(0, -1) would throw.
      if (!qName.equals(localName)) {
        int split = qName.indexOf(':');
        if (split > 0) {
          prefix = qName.substring(0, split);
        }
      }
      Namespace elementNamespace = Namespace.getNamespace(prefix, namespaceURI);
      element = factory.element(localName, elementNamespace);
    } else {
      element = factory.element(localName);
    }

    // Take leftover declared namespaces and add them to this element's
    // map of namespaces
    if (declaredNamespaces.size() > 0) {
      transferNamespaces(element);
    }

    // Handle attributes
    for (int i = 0, len = atts.getLength(); i < len; i++) {
      Attribute attribute = null;

      String attLocalName = atts.getLocalName(i);
      String attQName = atts.getQName(i);
      int attType = getAttributeType(atts.getType(i));

      // Bypass any xmlns attributes which might appear, as we got
      // them already in startPrefixMapping().
      // This is sometimes necessary when SAXHandler is used with
      // another source than SAXBuilder, as with JDOMResult.
      if (attQName.startsWith("xmlns:") || attQName.equals("xmlns")) {
        continue;
      }

      // First clause per http://markmail.org/message/2p245ggcjst27xe6
      // patch from Mattias Jiderhamn
      if ("".equals(attLocalName) && attQName.indexOf(":") == -1) {
        attribute = factory.attribute(attQName, atts.getValue(i), attType);
      } else if (!attQName.equals(attLocalName)) {
        // NOTE(review): this still assumes attQName contains a colon whenever it differs from
        // attLocalName. Left unchanged because the right fallback for a namespaced attribute
        // without a known prefix is not determinable from this method alone.
        String attPrefix = attQName.substring(0, attQName.indexOf(":"));
        Namespace attNs = Namespace.getNamespace(attPrefix, atts.getURI(i));

        attribute = factory.attribute(attLocalName, atts.getValue(i), attType, attNs);
      } else {
        attribute = factory.attribute(attLocalName, atts.getValue(i), attType);
      }
      factory.setAttribute(element, attribute);
    }

    flushCharacters();

    if (atRoot) {
      document.setRootElement(element); // XXX should we use a factory call?
      atRoot = false;
    } else {
      factory.addContent(getCurrentElement(), element);
    }
    currentElement = element;
  }
Пример #9
0
  /**
   * Handles the start of an element of a patent document.
   *
   * <ul>
   *   <li>{@code patent-document} / {@code fulltext-document}: resets the reference counters and
   *       buffers, and reads the {@code doc-number}, {@code kind} and {@code date} attributes.
   *   <li>{@code description}, {@code claim}, {@code invention-title}: clears the text
   *       accumulator.
   *   <li>{@code ref} / {@code bibl}: counts the reference and, for each {@code type} / {@code
   *       typ} attribute, labels the text read so far and sets the {@code npl} / {@code ref}
   *       flags according to the attribute value; finally clears {@code accumulatorRef}.
   *   <li>{@code patcit}: stores the {@code ucid} attribute in {@code cited_number}.
   * </ul>
   *
   * @param namespaceURI namespace URI of the element (unused)
   * @param localName local name of the element (unused)
   * @param qName qualified name of the element
   * @param atts attributes of the element
   * @throws SAXException declared by the SAX contract
   */
  public void startElement(String namespaceURI, String localName, String qName, Attributes atts)
      throws SAXException {
    if (qName.equals("patent-document") || qName.equals("fulltext-document")) {
      nbNPLRef = 0;
      nbPatentRef = 0;
      nbAllRef = 0;

      int length = atts.getLength();
      for (int i = 0; i < length; i++) {
        String name = atts.getQName(i);
        String value = atts.getValue(i);
        if (name == null) {
          continue;
        }
        if (name.equals("doc-number")) {
          PatentNumber = "EP" + value;
        } else if (name.equals("kind")) {
          CodeType = value;
        } else if (name.equals("date")) {
          PublicDate = value;
        }
      }

      CitedPatentNumber = new ArrayList<String>();
      accumulatedText = new StringBuffer();
      allContent = new StringBuffer();
      accumulator.setLength(0);
    } else if (qName.equals("description")) {
      accumulator.setLength(0);
    } else if (qName.equals("ref") || qName.equals("bibl")) {
      int length = atts.getLength();
      nbAllRef++;

      for (int i = 0; i < length; i++) {
        String name = atts.getQName(i);
        String value = atts.getValue(i);
        if ((name == null) || !(name.equals("type") || name.equals("typ"))) {
          continue;
        }

        if (value.equals("npl") || value.equals("book") || value.equals("journal")) {
          labelTextBeforeReference();
          npl = true;
          ref = true;
        } else if (value.equals("patent") || value.equals("pl")) {
          labelTextBeforeReference();
          npl = false;
          ref = true;
        } else {
          System.out.println("Warning: unknown attribute value for ref or bibl: " + value);
          ref = false;
          npl = false;
        }
      }

      accumulatorRef.setLength(0);
    } else if (qName.equals("claim")) {
      accumulator.setLength(0);
    } else if (qName.equals("invention-title")) {
      accumulator.setLength(0);
    } else if (qName.equals("patcit")) {
      int length = atts.getLength();
      for (int i = 0; i < length; i++) {
        String name = atts.getQName(i);
        String value = atts.getValue(i);
        if ((name != null) && name.equals("ucid")) {
          // The patent number would normally need some normalisation here.
          cited_number = value;
        }
      }
    }
  }

  /**
   * Tokenizes the text accumulated since the last flush and appends one labelled line per
   * non-blank token to {@code accumulatedText} (and the raw token to {@code allContent}), then
   * clears the accumulator.
   *
   * <p>A token is labelled {@code <other>} when its position is within the last {@code N}
   * tokens, or within the first {@code N} tokens while {@code refFound} is set; every other
   * token is labelled {@code <ignore>}. With {@code N == -1} neither comparison can be true, so
   * no separate check for that value is needed.
   */
  private void labelTextBeforeReference() {
    String content = getText();

    List<String> tokenizations = new ArrayList<String>();
    try {
      // TBD: pass a language object to the tokenize method call
      tokenizations = analyzer.tokenize(content);
    } catch (Exception e) {
      LOGGER.debug("Tokenization for XML patent document has failed.");
    }

    int nbTokens = tokenizations.size();
    int j = 0; // position among the non-blank tokens only
    for (String token : tokenizations) {
      if (token.trim().length() == 0) {
        continue;
      }

      boolean nearReference = (j > (nbTokens - N)) || (refFound && (j < N));
      String label = nearReference ? "<other>" : "<ignore>";
      try {
        accumulatedText.append(token + "\t" + label + "\n");
        allContent.append(token + " ");
      } catch (Exception e) {
        throw new GrobidException("An exception occured while running Grobid.", e);
      }
      j++;
    }

    accumulator.setLength(0);
  }
Пример #10
0
  /**
   * Receive notification of the start of an element.
   *
   * <p>Builds an {@code ElementInfo} for the element and, if construction succeeds, pushes it
   * onto {@code _stack}. The Java package for the element is resolved from its prefix, and the
   * class name is derived from the package plus the camel-cased local name. The element's
   * non-{@code xmlns} attributes are used as constructor arguments.
   *
   * <p>Failures are reported as {@code BeanAssemblyException} unless {@code _lenient} is set, in
   * which case they are only logged. Note that when a general failure is logged in lenient mode,
   * the {@code ElementInfo} is not pushed onto the stack.
   *
   * @param uri namespace URI of the element (only logged)
   * @param l local name of the element; determines the class name
   * @param q qualified name of the element; determines the package prefix
   * @param a attributes of the element
   */
  @Override
  public void startElement(String uri, String l, String q, Attributes a) {
    /*
     * 1. Load a class that matches the element name.
     * 2. If no class found, assume the element maps to a String.
     * 3. Otherwise, construct a new object of the class with element attributes.
     */
    _logger.fine(
        S.fine(_logger)
            ? "Consider element " + l + "\n             uri " + uri + "\n               q " + q
            : null);
    ElementInfo info = new ElementInfo();

    // Record java packages defined on this element as xmlns
    for (int i = 0; i < a.getLength(); ++i) {
      _logger.fine(
          S.fine(_logger)
              ? "            attr "
                  + a.getQName(i)
                  + "="
                  + a.getValue(i)
                  + "\n                 "
                  + a.getQName(i)
                  + ":"
                  + a.getURI(i)
              : null);
      // "xmlns:" is 6 characters and "java://" is 7: the map goes from prefix to package name.
      if (a.getQName(i).startsWith("xmlns:") && a.getValue(i).startsWith("java://")) {
        info.pkgs.put(a.getQName(i).substring(6), a.getValue(i).substring(7));
      }
    }

    // Resolve the package name of this element, which could be empty (default package)
    int colon = q.indexOf(':');
    if (colon > 0) {
      String xmlns = q.substring(0, colon);
      // is it defined right here?
      info.jpkg = info.pkgs.get(xmlns);
      // find a matching namespace from ancestors, nearest first
      if (info.jpkg == null && !_stack.isEmpty()) {
        for (int i = _stack.size() - 1; i >= 0; --i) {
          info.jpkg = _stack.get(i).pkgs.get(xmlns);
          if (info.jpkg != null) {
            break;
          }
        }
      }
    } else if (isPrimitiveType(q)) {
      info.jpkg = "java.lang";
    } else if (!_stack.isEmpty()) {
      // Unprefixed element: inherit the package of the enclosing element.
      info.jpkg = _stack.get(_stack.size() - 1).jpkg;
    } else {
      // Unprefixed root element: fall back to the configured package.
      info.jpkg = _jpkg;
    }

    _logger.fine("to create element with package = " + info.jpkg);
    try {
      info.name =
          (info.jpkg != null) ? info.jpkg + '.' + Strings.toCamelCase(l) : Strings.toCamelCase(l);
      try {
        if (info.name.endsWith("...")) {
          // Array construction: the class is named by everything before the "..." suffix
          info.type = Class.forName(info.name.substring(0, info.name.length() - 3));
          info.data = new ArrayList<Object>();
        } else {
          // Non-array construction: every attribute except namespace declarations becomes a
          // candidate constructor argument
          int size = a.getLength();
          TypedValueGroup arguments = new TypedValueGroup();
          for (int i = 0; i < size; ++i) {
            if (!a.getQName(i).startsWith("xmlns:") && !a.getQName(i).equals("xmlns")) {
              arguments.add(guessUntypedValue(a.getQName(i), a.getValue(i)));
            }
          }
          arguments.complete();
          _logger.fine(S.fine(_logger) ? "arguments=" + arguments : null);

          if (arguments.size() > 0) {
            if (arguments.size() == 1 && "java.lang".equals(info.jpkg)) {
              // Single attribute on a java.lang element: use the attribute value directly as
              // the data instead of invoking a constructor.
              info.inst.put(
                  "@as",
                  Strings.toCamelCase(
                      arguments.get(0).name, '-', false)); // respect original spelling
              info.data = arguments.get(0).get(0).data;
              info.type = arguments.get(0).get(0).type;
            } else {
              // Try each argument combination that load() yields until one construction
              // succeeds; remember the most recent failure for reporting.
              Exception last = null;
              Object[] args = new Object[arguments.size()];
              while (arguments.load(args, 0)) {
                try {
                  _logger.fine(
                      S.fine(_logger)
                          ? "to create " + info.name + " with args: " + args.length + args(args)
                          : null);
                  info.data = _factory.create(info.name, args);
                  info.type = info.data.getClass();
                  break;
                } catch (InvocationTargetException x) {
                  // Rethrown immediately rather than probing further combinations.
                  throw x;
                } catch (Exception x) {
                  last = x;
                  _logger.fine(
                      "failure in creating " + info.name + ": probing for other constructors");
                }
              }

              if (info.data == null) {
                // NOTE(review): if load() never returned true, last is still null here and
                // this statement raises a NullPointerException, which the outer catch handles.
                throw last;
              }
            }
          } else {
            _logger.fine("Create " + info.name + " with the default constructor");
            info.data = _factory.create(info.name);
            info.type = info.data.getClass();
          }
        }
      } catch (ClassNotFoundException x) {
        // no class by the element name is found, assumed String
        if (!_lenient) {
          throw new BeanAssemblyException("No class associated with element " + q);
        } else {
          _logger.log(Level.WARNING, "can't find class " + info.name, x);
        }
      }
      _stack.add(info);
      // _logger.fine(">>ElementInfo: " + info.type.getName() + " in " + info);
      // all other exceptions indicate mismatches between the beans and the XML schema
    } catch (Exception x) {
      if (!_lenient) {
        throw new BeanAssemblyException("Failed to assemble bean from element " + q, x);
      } else {
        _logger.log(Level.SEVERE, "can't create object for this element", x);
      }
    }
  }
Пример #11
0
  /**
   * Handles the start of a layout element.
   *
   * <ul>
   *   <li>{@code PAGE}: advances the page counter.
   *   <li>{@code BLOCK}: starts a new block on the current page and resets the text buffer and
   *       token counter.
   *   <li>{@code IMAGE}: records the {@code href} and updates the current position and size.
   *   <li>{@code TOKEN}: updates the current font, style, rotation, position and size.
   *   <li>{@code xi:include}: records the {@code href}.
   * </ul>
   *
   * <p>Any other element, including {@code TEXT}, leaves the state unchanged.
   *
   * @param namespaceURI namespace URI of the element (unused)
   * @param localName local name of the element (unused)
   * @param qName qualified name of the element
   * @param atts attributes of the element
   * @throws SAXException declared by the SAX contract
   * @throws NumberFormatException if a numeric attribute of an {@code IMAGE} or {@code TOKEN}
   *     element cannot be parsed
   */
  public void startElement(String namespaceURI, String localName, String qName, Attributes atts)
      throws SAXException {
    if (qName.equals("PAGE")) {
      // The page's own attributes (id, number, width, height) are not used.
      currentPage++;
    } else if (qName.equals("BLOCK")) {
      block = new Block();
      blabla = new StringBuffer();
      nbTokens = 0;
      block.setPage(currentPage);
    } else if (qName.equals("IMAGE")) {
      readImageAttributes(atts);
    } else if (qName.equals("TOKEN")) {
      readTokenAttributes(atts);
    } else if (qName.equals("xi:include")) {
      readIncludeAttributes(atts);
    }
  }

  /** Records the image reference and updates the current position and size from its box. */
  private void readImageAttributes(Attributes atts) {
    int length = atts.getLength();
    for (int i = 0; i < length; i++) {
      String name = atts.getQName(i);
      String value = atts.getValue(i);
      if ((name == null) || (value == null)) {
        continue;
      }
      if (name.equals("href")) {
        images.add(value);
      } else {
        updateCurrentBox(name, value);
      }
    }
  }

  /** Records the {@code href} of an included resource in {@code images}. */
  private void readIncludeAttributes(Attributes atts) {
    int length = atts.getLength();
    for (int i = 0; i < length; i++) {
      String name = atts.getQName(i);
      String value = atts.getValue(i);
      if ((name != null) && (value != null) && name.equals("href")) {
        images.add(value);
      }
    }
  }

  /** Updates the current font, style, rotation, position and size from a token's attributes. */
  private void readTokenAttributes(Attributes atts) {
    int length = atts.getLength();
    for (int i = 0; i < length; i++) {
      String name = atts.getQName(i);
      String value = atts.getValue(i);
      if ((name == null) || (value == null)) {
        continue;
      }

      if (name.equals("font-name")) {
        if (!value.equals(currentFont)) {
          currentFont = value;
          // A font change inserts a space into the block text.
          blabla.append(" ");
        }
      } else if (name.equals("font-size")) {
        double fontSize = Double.parseDouble(value);
        if (fontSize != currentFontSize) {
          currentFontSize = fontSize;
          // A size change inserts a space into the block text.
          blabla.append(" ");
        }
      } else if (name.equals("bold")) {
        currentBold = value.equals("yes");
      } else if (name.equals("italic")) {
        currentItalic = value.equals("yes");
      } else if (name.equals("font-color")) {
        colorFont = value;
      } else if (name.equals("rotation")) {
        currentRotation = !value.equals("0");
      } else if (name.equals("base")) {
        // The value is not stored; parsing is kept so a malformed number still fails here.
        Double.parseDouble(value);
      } else {
        updateCurrentBox(name, value);
      }
    }
  }

  /** Stores an {@code x}, {@code y}, {@code width} or {@code height} attribute; ignores others. */
  private void updateCurrentBox(String name, String value) {
    if (name.equals("x")) {
      currentX = Double.parseDouble(value);
    } else if (name.equals("y")) {
      currentY = Double.parseDouble(value);
    } else if (name.equals("width")) {
      currentWidth = Double.parseDouble(value);
    } else if (name.equals("height")) {
      currentHeight = Double.parseDouble(value);
    }
  }