/** * Returns the content of a JDOM Element detached from it. * * @param elt the element to get the content from. * @return a (possibly empty) list of JDOM nodes, detached from their parent. */ private List getDetachedContent(Element elt) { List content = elt.getContent(); List nodes = new ArrayList(content.size()); while (content.size() != 0) { Object o = content.remove(0); nodes.add(o); } return (nodes); }
/** * This will output a list of JDOM nodes as a fragment of an XML document, firing off the SAX * events that have been registered. * * <p><strong>Warning</strong>: This method does not call the {@link * ContentHandler#setDocumentLocator}, {@link ContentHandler#startDocument} and {@link * ContentHandler#endDocument} callbacks on the {@link #setContentHandler ContentHandler}. The * user shall invoke these methods directly prior/after outputting the document fragments. * * @param nodes <code>List</code> of JDOM nodes to output. * @throws JDOMException if any error occurred. * @see #outputFragment(org.jdom2.Content) */ public void outputFragment(List<? extends Content> nodes) throws JDOMException { if ((nodes == null) || (nodes.size() == 0)) { return; } // Output node list as a document fragment. elementContent(nodes, new NamespaceStack()); }
/** * Returns the result of an XSL Transformation as a list of JDOM nodes. * * <p>If the result of the transformation is a JDOM document, this method converts it into a list * of JDOM nodes; any subsequent call to {@link #getDocument} will return <code>null</code>. * * @return the transformation result as a (possibly empty) list of JDOM nodes (Elements, Texts, * Comments, PIs...). */ public List getResult() { List nodes = Collections.EMPTY_LIST; // Retrieve result from the document builder if not set. this.retrieveResult(); if (result instanceof List) { nodes = (List) result; } else { if ((result instanceof Document) && (queried == false)) { List content = ((Document) result).getContent(); nodes = new ArrayList(content.size()); while (content.size() != 0) { Object o = content.remove(0); nodes.add(o); } result = nodes; } } queried = true; return (nodes); }
/** * This will output a list of JDOM nodes as a document, firing off the SAX events that have been * registered. * * <p><strong>Warning</strong>: This method may output ill-formed XML documents if the list * contains top-level objects that are not legal at the document level (e.g. Text or CDATA nodes, * multiple Element nodes, etc.). Thus, it should only be used to output document portions towards * ContentHandlers capable of accepting such ill-formed documents (such as XSLT processors). * * @param nodes <code>List</code> of JDOM nodes to output. * @throws JDOMException if any error occurred. * @see #output(org.jdom2.Document) */ public void output(List<? extends Content> nodes) throws JDOMException { if ((nodes == null) || (nodes.size() == 0)) { return; } // contentHandler.setDocumentLocator() documentLocator(null); // contentHandler.startDocument() startDocument(); // Process node list. elementContent(nodes, new NamespaceStack()); // contentHandler.endDocument() endDocument(); }
/** * This reports the occurrence of an actual element. It will include the element's attributes, * with the exception of XML vocabulary specific attributes, such as <code> * xmlns:[namespace prefix]</code> and <code>xsi:schemaLocation</code>. * * @param namespaceURI <code>String</code> namespace URI this element is associated with, or an * empty <code>String</code> * @param localName <code>String</code> name of element (with no namespace prefix, if one is * present) * @param qName <code>String</code> XML 1.0 version of element name: [namespace * prefix]:[localName] * @param atts <code>Attributes</code> list for this element * @throws SAXException when things go wrong */ public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException { if (suppress) return; Element element = null; if ((namespaceURI != null) && (!namespaceURI.equals(""))) { String prefix = ""; // Determine any prefix on the Element if (!qName.equals(localName)) { int split = qName.indexOf(":"); prefix = qName.substring(0, split); } Namespace elementNamespace = Namespace.getNamespace(prefix, namespaceURI); element = factory.element(localName, elementNamespace); } else { element = factory.element(localName); } // Take leftover declared namespaces and add them to this element's // map of namespaces if (declaredNamespaces.size() > 0) { transferNamespaces(element); } // Handle attributes for (int i = 0, len = atts.getLength(); i < len; i++) { Attribute attribute = null; String attLocalName = atts.getLocalName(i); String attQName = atts.getQName(i); int attType = getAttributeType(atts.getType(i)); // Bypass any xmlns attributes which might appear, as we got // them already in startPrefixMapping(). // This is sometimes necessary when SAXHandler is used with // another source than SAXBuilder, as with JDOMResult. if (attQName.startsWith("xmlns:") || attQName.equals("xmlns")) { continue; } // First clause per http://markmail.org/message/2p245ggcjst27xe6 // patch from Mattias Jiderhamn if ("".equals(attLocalName) && attQName.indexOf(":") == -1) { attribute = factory.attribute(attQName, atts.getValue(i), attType); } else if (!attQName.equals(attLocalName)) { String attPrefix = attQName.substring(0, attQName.indexOf(":")); Namespace attNs = Namespace.getNamespace(attPrefix, atts.getURI(i)); attribute = factory.attribute(attLocalName, atts.getValue(i), attType, attNs); } else { attribute = factory.attribute(attLocalName, atts.getValue(i), attType); } factory.setAttribute(element, attribute); } flushCharacters(); if (atRoot) { document.setRootElement(element); // XXX should we use a factory call? atRoot = false; } else { factory.addContent(getCurrentElement(), element); } currentElement = element; }
public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException { if (qName.equals("patent-document") || qName.equals("fulltext-document")) { nbNPLRef = 0; nbPatentRef = 0; nbAllRef = 0; int length = atts.getLength(); // Process each attribute for (int i = 0; i < length; i++) { // Get names and values for each attribute String name = atts.getQName(i); String value = atts.getValue(i); if (name != null) { if (name.equals("lang")) { // Global_Language_Code = value.toLowerCase(); } if (name.equals("doc-number")) { PatentNumber = "EP" + value; } if (name.equals("kind")) { CodeType = value; } if (name.equals("date")) { PublicDate = value; } } } CitedPatentNumber = new ArrayList<String>(); accumulatedText = new StringBuffer(); allContent = new StringBuffer(); accumulator.setLength(0); } else if (qName.equals("description")) { accumulator.setLength(0); } else if (qName.equals("ref") || qName.equals("bibl")) { int length = atts.getLength(); nbAllRef++; // Process each attribute for (int i = 0; i < length; i++) { // Get names and values for each attribute String name = atts.getQName(i); String value = atts.getValue(i); if (name != null) { if (name.equals("type") || name.equals("typ")) { if (value.equals("npl") || value.equals("book") || value.equals("journal")) { String content = getText(); // we output what has been read so far in the description // we tokenize the text // ArrayList<String> tokens = // StringTokenizer st = new StringTokenizer(content, delimiters, true); List<String> tokenizations = new ArrayList<String>(); try { // TBD: pass a language object to the tokenize method call tokenizations = analyzer.tokenize(content); } catch (Exception e) { LOGGER.debug("Tokenization for XML patent document has failed."); } // int nbTokens = st.countTokens(); int nbTokens = tokenizations.size(); int j = 0; // while (st.hasMoreTokens()) { for (String token : tokenizations) { // String token = st.nextToken().trim(); if ((token.trim().length() == 0) || (token.equals(" ")) || (token.equals("\t")) || (token.equals("\n")) || (token.equals("\r"))) { continue; } if ((j > (nbTokens - N) && (N != -1)) || (refFound && (j < N) && (N != -1))) { try { accumulatedText.append(token + "\t" + "<other>\n"); allContent.append(token + " "); } catch (Exception e) { // e.printStackTrace(); throw new GrobidException("An exception occured while running Grobid.", e); } } else { try { accumulatedText.append(token + "\t" + "<ignore>\n"); allContent.append(token + " "); } catch (Exception e) { // e.printStackTrace(); throw new GrobidException("An exception occured while running Grobid.", e); } } j++; } accumulator.setLength(0); npl = true; ref = true; } else if (value.equals("patent") || value.equals("pl")) { String content = getText(); // we output what has been read so far in the description // we tokenize the text // ArrayList<String> tokens = // TextUtilities.segment(content,"[("+TextUtilities.punctuations); // StringTokenizer st = new StringTokenizer(content, delimiters, true); List<String> tokenizations = new ArrayList<String>(); try { // TBD: pass a language object to the tokenize method call tokenizations = analyzer.tokenize(content); } catch (Exception e) { LOGGER.debug("Tokenization for XML patent document has failed."); } // int nbTokens = st.countTokens(); int nbTokens = tokenizations.size(); int j = 0; for (String token : tokenizations) { // while (st.hasMoreTokens()) { // String token = st.nextToken().trim(); if ((token.trim().length() == 0) || (token.equals(" ")) || (token.equals("\t")) || (token.equals("\n")) || (token.equals("\r"))) { continue; } if ((j > (nbTokens - N)) | (refFound & (j < N))) { try { accumulatedText.append(token + "\t" + "<other>\n"); allContent.append(token + " "); } catch (Exception e) { // e.printStackTrace(); throw new GrobidException("An exception occured while running Grobid.", e); } } else { try { accumulatedText.append(token + "\t" + "<ignore>\n"); allContent.append(token + " "); } catch (Exception e) { // e.printStackTrace(); throw new GrobidException("An exception occured while running Grobid.", e); } } j++; } accumulator.setLength(0); npl = false; ref = true; } else { System.out.println("Warning: unknown attribute value for ref or bibl: " + value); ref = false; npl = false; } } } } accumulatorRef.setLength(0); } else if (qName.equals("claim")) { accumulator.setLength(0); } else if (qName.equals("invention-title")) { accumulator.setLength(0); } else if (qName.equals("patcit")) { int length = atts.getLength(); // Process each attribute for (int i = 0; i < length; i++) { // Get names and values for each attribute String name = atts.getQName(i); String value = atts.getValue(i); if (name != null) { if (name.equals("ucid")) { cited_number = value; // we normally need to normalize a little bit this patent nummer } } } } }
public String getText() { String text = accumulator.toString(); if (text.trim().length() == 0) { return ""; } /*text = text.replace("\n", " "); text = text.replace(" ", " ");*/ if (counting) { /* StringTokenizer st = new StringTokenizer(text, delimiters, true); int count = 0; while(st.hasMoreTokens()) { String token = st.nextToken().trim(); if (token.length() == 0) { continue; } count++; } */ int i = currentPatentIndex; int count = text.length(); while (i < patents.size()) { PatentItem currentPatent = patents.get(i); if (currentPatent != null) { int startOffset = currentPatent.getOffsetBegin(); int endOffset = currentPatent.getOffsetEnd(); if ((startOffset >= offset) && (endOffset <= offset + count)) { String context = currentPatent.getContext(); /*System.out.println("OFFSET: " + offset); System.out.println("count: " + count); System.out.println("startOffset: " + startOffset); System.out.println("endOffset: " + endOffset); System.out.println("context: " + context); System.out.println("text: " + text);*/ String target = ""; if (context.charAt(0) == ' ') { target = " <ref type=\"patent\">" + context.substring(1, context.length()) + "</ref>"; } else { target = "<ref type=\"patent\">" + context + "</ref>"; } text = text.replace(context, target); currentPatentIndex = i; } } i++; } // i = currentArticleIndex; i = 0; while (i < articles.size()) { BibDataSet currentArticle = articles.get(i); if (currentArticle != null) { List<Integer> offsets = currentArticle.getOffsets(); int startOffset = -1; int endOffset = -1; String context = currentArticle.getRawBib().trim(); if (offsets.size() > 0) { if (offsets.get(0) != null) { startOffset = offsets.get(0).intValue(); /*StringTokenizer stt = new StringTokenizer(context, delimiters, true); int count2 = 0; while(stt.hasMoreTokens()) { String token2 = stt.nextToken().trim(); if (token2.length() == 0) { continue; } count2++; }*/ // endOffset = offsets.get(1).intValue(); endOffset = startOffset + context.length(); } } // if ( (startOffset >= offset) && (endOffset <= offset+count) ) { if ((startOffset >= offset)) { /*System.out.println("OFFSET: " + offset); System.out.println("count: " + count); System.out.println("startOffset: " + startOffset); System.out.println("endOffset: " + endOffset); System.out.println("context: " + context); System.out.println("text: " + text);*/ String target = " <ref type=\"npl\">" + context + "</ref> "; text = text.replace(context, target); currentArticleIndex = i; } } i++; } offset += count; } return text; }