/** * This will invoke the <code>startElement</code> callback in the <code>ContentHandler</code>. * * @param element <code>Element</code> used in callbacks. * @param nsAtts <code>List</code> of namespaces to declare with the element or <code>null</code>. */ private void startElement(Element element, Attributes nsAtts) throws JDOMException { String namespaceURI = element.getNamespaceURI(); String localName = element.getName(); String rawName = element.getQualifiedName(); // Allocate attribute list. AttributesImpl atts = (nsAtts != null) ? new AttributesImpl(nsAtts) : new AttributesImpl(); List attributes = element.getAttributes(); Iterator i = attributes.iterator(); while (i.hasNext()) { Attribute a = (Attribute) i.next(); atts.addAttribute( a.getNamespaceURI(), a.getName(), a.getQualifiedName(), getAttributeTypeName(a.getAttributeType()), a.getValue()); } try { contentHandler.startElement(namespaceURI, localName, rawName, atts); } catch (SAXException se) { throw new JDOMException("Exception in startElement", se); } }
/**
 * Adds every pending namespace held in <code>declaredNamespaces</code> to the supplied <code>
 * {@link Element}</code> as a namespace declaration, skipping the instance that is the element's
 * own namespace, and then empties the pending collection.
 *
 * @param element <code>Element</code> that receives the namespace declarations.
 */
private void transferNamespaces(Element element) {
  for (Iterator pending = declaredNamespaces.iterator(); pending.hasNext(); ) {
    Namespace candidate = (Namespace) pending.next();
    // Identity comparison, as in the original code.
    if (candidate == element.getNamespace()) {
      continue;
    }
    element.addNamespaceDeclaration(candidate);
  }
  declaredNamespaces.clear();
}
/**
 * Records a prefix mapping reported by the parser in <code>declaredNamespaces</code>. Nothing is
 * recorded while <code>suppress</code> is set.
 *
 * @param prefix <code>String</code> namespace prefix.
 * @param uri <code>String</code> namespace URI.
 */
public void startPrefixMapping(String prefix, String uri) throws SAXException {
  if (!suppress) {
    declaredNamespaces.add(Namespace.getNamespace(prefix, uri));
  }
}
/** * This will output a list of JDOM nodes as a fragment of an XML document, firing off the SAX * events that have been registered. * * <p><strong>Warning</strong>: This method does not call the {@link * ContentHandler#setDocumentLocator}, {@link ContentHandler#startDocument} and {@link * ContentHandler#endDocument} callbacks on the {@link #setContentHandler ContentHandler}. The * user shall invoke these methods directly prior/after outputting the document fragments. * * @param nodes <code>List</code> of JDOM nodes to output. * @throws JDOMException if any error occurred. * @see #outputFragment(org.jdom2.Content) */ public void outputFragment(List<? extends Content> nodes) throws JDOMException { if ((nodes == null) || (nodes.size() == 0)) { return; } // Output node list as a document fragment. elementContent(nodes, new NamespaceStack()); }
/** * This will invoke the <code>ContentHandler.startPrefixMapping</code> callback when a new * namespace is encountered in the <code>Document</code>. * * @param element <code>Element</code> used in callbacks. * @param namespaces <code>List</code> stack of Namespaces in scope. * @return <code>Attributes</code> declaring the namespaces local to <code>element</code> or * <code>null</code>. */ private Attributes startPrefixMapping(Element element, NamespaceStack namespaces) throws JDOMException { AttributesImpl nsAtts = null; // The namespaces as xmlns attributes Namespace ns = element.getNamespace(); if (ns != Namespace.XML_NAMESPACE) { String prefix = ns.getPrefix(); String uri = namespaces.getURI(prefix); if (!ns.getURI().equals(uri)) { namespaces.push(ns); nsAtts = this.addNsAttribute(nsAtts, ns); try { contentHandler.startPrefixMapping(prefix, ns.getURI()); } catch (SAXException se) { throw new JDOMException("Exception in startPrefixMapping", se); } } } // Fire additional namespace declarations List additionalNamespaces = element.getAdditionalNamespaces(); if (additionalNamespaces != null) { Iterator itr = additionalNamespaces.iterator(); while (itr.hasNext()) { ns = (Namespace) itr.next(); String prefix = ns.getPrefix(); String uri = namespaces.getURI(prefix); if (!ns.getURI().equals(uri)) { namespaces.push(ns); nsAtts = this.addNsAttribute(nsAtts, ns); try { contentHandler.startPrefixMapping(prefix, ns.getURI()); } catch (SAXException se) { throw new JDOMException("Exception in startPrefixMapping", se); } } } } return nsAtts; }
/** * This will invoke the callbacks for the content of an element. * * @param content element content as a <code>List</code> of nodes. * @param namespaces <code>List</code> stack of Namespaces in scope. */ private void elementContent(List content, NamespaceStack namespaces) throws JDOMException { for (Iterator i = content.iterator(); i.hasNext(); ) { Object obj = i.next(); if (obj instanceof Content) { this.elementContent((Content) obj, namespaces); } else { // Not a valid element child. This could happen with // application-provided lists which may contain non // JDOM objects. handleError(new JDOMException("Invalid element content: " + obj)); } } }
/**
 * Moves the content of a JDOM Element into a new list, removing each node from the element's
 * content list in document order.
 *
 * @param elt the element to get the content from.
 * @return a (possibly empty) list holding the nodes removed from <code>elt</code>.
 */
private List getDetachedContent(Element elt) {
  List live = elt.getContent();
  List detached = new ArrayList(live.size());
  // Always take the head so the nodes keep their original order.
  while (!live.isEmpty()) {
    detached.add(live.remove(0));
  }
  return detached;
}
/** * This will output a list of JDOM nodes as a document, firing off the SAX events that have been * registered. * * <p><strong>Warning</strong>: This method may output ill-formed XML documents if the list * contains top-level objects that are not legal at the document level (e.g. Text or CDATA nodes, * multiple Element nodes, etc.). Thus, it should only be used to output document portions towards * ContentHandlers capable of accepting such ill-formed documents (such as XSLT processors). * * @param nodes <code>List</code> of JDOM nodes to output. * @throws JDOMException if any error occurred. * @see #output(org.jdom2.Document) */ public void output(List<? extends Content> nodes) throws JDOMException { if ((nodes == null) || (nodes.size() == 0)) { return; } // contentHandler.setDocumentLocator() documentLocator(null); // contentHandler.startDocument() startDocument(); // Process node list. elementContent(nodes, new NamespaceStack()); // contentHandler.endDocument() endDocument(); }
/** * Returns the result of an XSL Transformation as a list of JDOM nodes. * * <p>If the result of the transformation is a JDOM document, this method converts it into a list * of JDOM nodes; any subsequent call to {@link #getDocument} will return <code>null</code>. * * @return the transformation result as a (possibly empty) list of JDOM nodes (Elements, Texts, * Comments, PIs...). */ public List getResult() { List nodes = Collections.EMPTY_LIST; // Retrieve result from the document builder if not set. this.retrieveResult(); if (result instanceof List) { nodes = (List) result; } else { if ((result instanceof Document) && (queried == false)) { List content = ((Document) result).getContent(); nodes = new ArrayList(content.size()); while (content.size() != 0) { Object o = content.remove(0); nodes.add(o); } result = nodes; } } queried = true; return (nodes); }
/** * This reports the occurrence of an actual element. It will include the element's attributes, * with the exception of XML vocabulary specific attributes, such as <code> * xmlns:[namespace prefix]</code> and <code>xsi:schemaLocation</code>. * * @param namespaceURI <code>String</code> namespace URI this element is associated with, or an * empty <code>String</code> * @param localName <code>String</code> name of element (with no namespace prefix, if one is * present) * @param qName <code>String</code> XML 1.0 version of element name: [namespace * prefix]:[localName] * @param atts <code>Attributes</code> list for this element * @throws SAXException when things go wrong */ public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException { if (suppress) return; Element element = null; if ((namespaceURI != null) && (!namespaceURI.equals(""))) { String prefix = ""; // Determine any prefix on the Element if (!qName.equals(localName)) { int split = qName.indexOf(":"); prefix = qName.substring(0, split); } Namespace elementNamespace = Namespace.getNamespace(prefix, namespaceURI); element = factory.element(localName, elementNamespace); } else { element = factory.element(localName); } // Take leftover declared namespaces and add them to this element's // map of namespaces if (declaredNamespaces.size() > 0) { transferNamespaces(element); } // Handle attributes for (int i = 0, len = atts.getLength(); i < len; i++) { Attribute attribute = null; String attLocalName = atts.getLocalName(i); String attQName = atts.getQName(i); int attType = getAttributeType(atts.getType(i)); // Bypass any xmlns attributes which might appear, as we got // them already in startPrefixMapping(). // This is sometimes necessary when SAXHandler is used with // another source than SAXBuilder, as with JDOMResult. 
if (attQName.startsWith("xmlns:") || attQName.equals("xmlns")) { continue; } // First clause per http://markmail.org/message/2p245ggcjst27xe6 // patch from Mattias Jiderhamn if ("".equals(attLocalName) && attQName.indexOf(":") == -1) { attribute = factory.attribute(attQName, atts.getValue(i), attType); } else if (!attQName.equals(attLocalName)) { String attPrefix = attQName.substring(0, attQName.indexOf(":")); Namespace attNs = Namespace.getNamespace(attPrefix, atts.getURI(i)); attribute = factory.attribute(attLocalName, atts.getValue(i), attType, attNs); } else { attribute = factory.attribute(attLocalName, atts.getValue(i), attType); } factory.setAttribute(element, attribute); } flushCharacters(); if (atRoot) { document.setRootElement(element); // XXX should we use a factory call? atRoot = false; } else { factory.addContent(getCurrentElement(), element); } currentElement = element; }
public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException { if (qName.equals("patent-document") || qName.equals("fulltext-document")) { nbNPLRef = 0; nbPatentRef = 0; nbAllRef = 0; int length = atts.getLength(); // Process each attribute for (int i = 0; i < length; i++) { // Get names and values for each attribute String name = atts.getQName(i); String value = atts.getValue(i); if (name != null) { if (name.equals("lang")) { // Global_Language_Code = value.toLowerCase(); } if (name.equals("doc-number")) { PatentNumber = "EP" + value; } if (name.equals("kind")) { CodeType = value; } if (name.equals("date")) { PublicDate = value; } } } CitedPatentNumber = new ArrayList<String>(); accumulatedText = new StringBuffer(); allContent = new StringBuffer(); accumulator.setLength(0); } else if (qName.equals("description")) { accumulator.setLength(0); } else if (qName.equals("ref") || qName.equals("bibl")) { int length = atts.getLength(); nbAllRef++; // Process each attribute for (int i = 0; i < length; i++) { // Get names and values for each attribute String name = atts.getQName(i); String value = atts.getValue(i); if (name != null) { if (name.equals("type") || name.equals("typ")) { if (value.equals("npl") || value.equals("book") || value.equals("journal")) { String content = getText(); // we output what has been read so far in the description // we tokenize the text // ArrayList<String> tokens = // StringTokenizer st = new StringTokenizer(content, delimiters, true); List<String> tokenizations = new ArrayList<String>(); try { // TBD: pass a language object to the tokenize method call tokenizations = analyzer.tokenize(content); } catch (Exception e) { LOGGER.debug("Tokenization for XML patent document has failed."); } // int nbTokens = st.countTokens(); int nbTokens = tokenizations.size(); int j = 0; // while (st.hasMoreTokens()) { for (String token : tokenizations) { // String token = st.nextToken().trim(); if 
((token.trim().length() == 0) || (token.equals(" ")) || (token.equals("\t")) || (token.equals("\n")) || (token.equals("\r"))) { continue; } if ((j > (nbTokens - N) && (N != -1)) || (refFound && (j < N) && (N != -1))) { try { accumulatedText.append(token + "\t" + "<other>\n"); allContent.append(token + " "); } catch (Exception e) { // e.printStackTrace(); throw new GrobidException("An exception occured while running Grobid.", e); } } else { try { accumulatedText.append(token + "\t" + "<ignore>\n"); allContent.append(token + " "); } catch (Exception e) { // e.printStackTrace(); throw new GrobidException("An exception occured while running Grobid.", e); } } j++; } accumulator.setLength(0); npl = true; ref = true; } else if (value.equals("patent") || value.equals("pl")) { String content = getText(); // we output what has been read so far in the description // we tokenize the text // ArrayList<String> tokens = // TextUtilities.segment(content,"[("+TextUtilities.punctuations); // StringTokenizer st = new StringTokenizer(content, delimiters, true); List<String> tokenizations = new ArrayList<String>(); try { // TBD: pass a language object to the tokenize method call tokenizations = analyzer.tokenize(content); } catch (Exception e) { LOGGER.debug("Tokenization for XML patent document has failed."); } // int nbTokens = st.countTokens(); int nbTokens = tokenizations.size(); int j = 0; for (String token : tokenizations) { // while (st.hasMoreTokens()) { // String token = st.nextToken().trim(); if ((token.trim().length() == 0) || (token.equals(" ")) || (token.equals("\t")) || (token.equals("\n")) || (token.equals("\r"))) { continue; } if ((j > (nbTokens - N)) | (refFound & (j < N))) { try { accumulatedText.append(token + "\t" + "<other>\n"); allContent.append(token + " "); } catch (Exception e) { // e.printStackTrace(); throw new GrobidException("An exception occured while running Grobid.", e); } } else { try { accumulatedText.append(token + "\t" + "<ignore>\n"); 
allContent.append(token + " "); } catch (Exception e) { // e.printStackTrace(); throw new GrobidException("An exception occured while running Grobid.", e); } } j++; } accumulator.setLength(0); npl = false; ref = true; } else { System.out.println("Warning: unknown attribute value for ref or bibl: " + value); ref = false; npl = false; } } } } accumulatorRef.setLength(0); } else if (qName.equals("claim")) { accumulator.setLength(0); } else if (qName.equals("invention-title")) { accumulator.setLength(0); } else if (qName.equals("patcit")) { int length = atts.getLength(); // Process each attribute for (int i = 0; i < length; i++) { // Get names and values for each attribute String name = atts.getQName(i); String value = atts.getValue(i); if (name != null) { if (name.equals("ucid")) { cited_number = value; // we normally need to normalize a little bit this patent nummer } } } } }
public void endElement(java.lang.String uri, java.lang.String localName, java.lang.String qName) throws SAXException { if (qName.equals("date")) { accumulator.setLength(0); } else if (qName.equals("ref") || qName.equals("bibl")) { String refString = getRefText(); refString = refString.replace("\n", " "); refString = refString.replace("\t", " "); refString = refString.replace(" ", " "); if (npl && ref) { if (referencesNPL == null) referencesNPL = new ArrayList<String>(); referencesNPL.add(refString); refFound = true; if (nplReferences) nbNPLRef++; } else if (ref) { if (referencesPatent == null) { referencesPatent = new HashMap<String, ArrayList<String>>(); } ArrayList<String> refss = referencesPatent.get(currentFileName); if (refss == null) { refss = new ArrayList<String>(); } refss.add(refString); referencesPatent.put(currentFileName, refss); refFound = true; if (patentReferences) { nbPatentRef++; } } if (refFound) { // we tokenize the text // ArrayList<String> tokens = TextUtilities.segment(refString, // "[("+TextUtilities.punctuations); // StringTokenizer st = new StringTokenizer(refString, delimiters, true); List<String> tokenizations = new ArrayList<String>(); try { // TBD: pass a language object to the tokenize method call tokenizations = analyzer.tokenize(refString); } catch (Exception e) { LOGGER.debug("Tokenization for XML patent document has failed."); } int i = 0; // String token = null; // for(String token : tokens) { // while (st.hasMoreTokens()) { for (String token : tokenizations) { // token = st.nextToken().trim(); if ((token.trim().length() == 0) || (token.equals(" ")) || (token.equals("\t")) || (token.equals("\n")) || (token.equals("\r"))) { continue; } try { accumulatedText.append(token + "\t"); allContent.append(token + " "); if (npl) { if (nplReferences) { if (i == 0) { // accumulatedText.append("refNPLBegin\n"); accumulatedText.append("I-<refNPL>\n"); } else if (token == null) { // accumulatedText.append("refNPLEnd\n"); 
accumulatedText.append("E-<refNPL>\n"); } else { accumulatedText.append("<refNPL>\n"); } } else accumulatedText.append("<other>\n"); } else { if (patentReferences) { if (i == 0) accumulatedText.append("I-<refPatent>\n"); else if (token == null) accumulatedText.append("E-<refPatent>\n"); else accumulatedText.append("<refPatent>\n"); } else accumulatedText.append("<other>\n"); } } catch (Exception e) { // e.printStackTrace(); throw new GrobidException("An exception occured while running Grobid.", e); } i++; } } ref = false; } else if (qName.equals("classification-ipcr")) { accumulator.setLength(0); } else if (qName.equals("classification-symbol")) { accumulator.setLength(0); } else if (qName.equals("abstract")) { accumulator.setLength(0); } else if (qName.equals("heading")) { accumulator.append(" "); } else if (qName.equals("description")) { if (refFound) { String content = getText(); // we tokenize the text // ArrayList<String> tokens = TextUtilities.segment(content, // "[("+TextUtilities.punctuations); // StringTokenizer st = new StringTokenizer(content, delimiters, true); List<String> tokenizations = new ArrayList<String>(); try { // TBD: pass a language object to the tokenize method call tokenizations = analyzer.tokenize(content); } catch (Exception e) { LOGGER.debug("Tokenization for XML patent document has failed."); } int i = 0; // String token = null; // for(String token : tokens) { // while (st.hasMoreTokens()) { for (String token : tokenizations) { // token = st.nextToken().trim(); if ((token.trim().length() == 0) || (token.equals(" ")) || (token.equals("\t")) || (token.equals("\n")) || (token.equals("\r"))) { continue; } // we print only a window of N words if ((i > N) && (N != -1)) { // break; token = token.trim(); if (token.length() > 0) { accumulatedText.append(token + "\t" + "<ignore>\n"); allContent.append(token + " "); } } else { try { token = token.trim(); if (token.length() > 0) { accumulatedText.append(token + "\t" + "<other>\n"); 
allContent.append(token + " "); } } catch (Exception e) { throw new GrobidException("An exception occured while running Grobid.", e); } } i++; } accumulator.setLength(0); refFound = false; } } else if (qName.equals("patcit")) { // we register the citation, the citation context will be marked in a later stage if (citations == null) citations = new ArrayList<String>(); citations.add(cited_number); accumulator.setLength(0); } else if (qName.equals("invention-title")) { accumulator.setLength(0); } else if (qName.equals("applicants")) { accumulator.setLength(0); } else if (qName.equals("inventors")) { accumulator.setLength(0); } else if (qName.equals("document-id")) { accumulator.setLength(0); } else if (qName.equals("legal-status")) { accumulator.setLength(0); } else if (qName.equals("bibliographic-data")) { accumulator.setLength(0); } else if (qName.equals("doc-number")) { accumulator.setLength(0); } else if (qName.equals("country")) { accumulator.setLength(0); } else if (qName.equals("kind")) { accumulator.setLength(0); } else if (qName.equals("classification-symbol")) { accumulator.setLength(0); } else if (qName.equals("classification-ecla")) { accumulator.setLength(0); } else if (qName.equals("patent-document") || qName.equals("fulltext-document")) { String allString = allContent.toString(); journalsPositions = lexicon.inJournalNames(allString); abbrevJournalsPositions = lexicon.inAbbrevJournalNames(allString); conferencesPositions = lexicon.inConferenceNames(allString); publishersPositions = lexicon.inPublisherNames(allString); allContent = null; allString = null; } else if (qName.equals("row")) { accumulator.append(" "); } else if (qName.equals("p")) { accumulator.append("\n"); } }
public String getText() { String text = accumulator.toString(); if (text.trim().length() == 0) { return ""; } /*text = text.replace("\n", " "); text = text.replace(" ", " ");*/ if (counting) { /* StringTokenizer st = new StringTokenizer(text, delimiters, true); int count = 0; while(st.hasMoreTokens()) { String token = st.nextToken().trim(); if (token.length() == 0) { continue; } count++; } */ int i = currentPatentIndex; int count = text.length(); while (i < patents.size()) { PatentItem currentPatent = patents.get(i); if (currentPatent != null) { int startOffset = currentPatent.getOffsetBegin(); int endOffset = currentPatent.getOffsetEnd(); if ((startOffset >= offset) && (endOffset <= offset + count)) { String context = currentPatent.getContext(); /*System.out.println("OFFSET: " + offset); System.out.println("count: " + count); System.out.println("startOffset: " + startOffset); System.out.println("endOffset: " + endOffset); System.out.println("context: " + context); System.out.println("text: " + text);*/ String target = ""; if (context.charAt(0) == ' ') { target = " <ref type=\"patent\">" + context.substring(1, context.length()) + "</ref>"; } else { target = "<ref type=\"patent\">" + context + "</ref>"; } text = text.replace(context, target); currentPatentIndex = i; } } i++; } // i = currentArticleIndex; i = 0; while (i < articles.size()) { BibDataSet currentArticle = articles.get(i); if (currentArticle != null) { List<Integer> offsets = currentArticle.getOffsets(); int startOffset = -1; int endOffset = -1; String context = currentArticle.getRawBib().trim(); if (offsets.size() > 0) { if (offsets.get(0) != null) { startOffset = offsets.get(0).intValue(); /*StringTokenizer stt = new StringTokenizer(context, delimiters, true); int count2 = 0; while(stt.hasMoreTokens()) { String token2 = stt.nextToken().trim(); if (token2.length() == 0) { continue; } count2++; }*/ // endOffset = offsets.get(1).intValue(); endOffset = startOffset + context.length(); } } // if ( 
(startOffset >= offset) && (endOffset <= offset+count) ) { if ((startOffset >= offset)) { /*System.out.println("OFFSET: " + offset); System.out.println("count: " + count); System.out.println("startOffset: " + startOffset); System.out.println("endOffset: " + endOffset); System.out.println("context: " + context); System.out.println("text: " + text);*/ String target = " <ref type=\"npl\">" + context + "</ref> "; text = text.replace(context, target); currentArticleIndex = i; } } i++; } offset += count; } return text; }