/** * Create the set of training and evaluation sets from the annotated examples with extraction of * citations in the patent description body. * * @param rank rank associated to the set for n-fold data generation * @param type type of data to be created, 0 is training data, 1 is evaluation data */ public void createDataSet( String setName, String rank, String corpusPath, String outputPath, int type) { int nbFiles = 0; int nbNPLRef = 0; int nbPatentRef = 0; int maxRef = 0; try { // PATENT REF. textual data // we use a SAX parser on the patent XML files MarecSaxParser sax = new MarecSaxParser(); sax.patentReferences = true; sax.nplReferences = false; int srCitations = 0; int previousSrCitations = 0; int withSR = 0; List<OffsetPosition> journalsPositions = null; List<OffsetPosition> abbrevJournalsPositions = null; List<OffsetPosition> conferencesPositions = null; List<OffsetPosition> publishersPositions = null; if (type == 0) { // training set sax.setN(trainWindow); } else { // for the test set we enlarge the focus window to include all the document. sax.setN(-1); } // get a factory /*SAXParserFactory spf = SAXParserFactory.newInstance(); spf.setValidating(false); spf.setFeature("http://xml.org/sax/features/namespaces", false); spf.setFeature("http://xml.org/sax/features/validation", false); LinkedList<File> fileList = new LinkedList<File>(); if (setName == null) { fileList.add(new File(corpusPath)); } else if (rank == null) { fileList.add(new File(corpusPath)); } else { // n-fold evaluation fileList.add(new File(corpusPath + File.separator + setName + "ing" + rank + File.separator)); } Writer writer = null; if ((setName == null) || (setName.length() == 0)) { writer = new OutputStreamWriter(new FileOutputStream( new File(outputPath + "/patent.train"), false), "UTF-8"); } else if (rank == null) { writer = new OutputStreamWriter(new FileOutputStream( new File(outputPath + "/patent." + setName), false), "UTF-8"); } else { writer = new OutputStreamWriter(new FileOutputStream( new File(outputPath + setName + "ing" + rank + "/patent." + setName), false), "UTF-8"); } while (fileList.size() > 0) { File file = fileList.removeFirst(); if (file.isDirectory()) { for (File subFile : file.listFiles()) fileList.addLast(subFile); } else { if (file.getName().endsWith(".xml")) { nbFiles++; System.out.println(file.getAbsolutePath()); try { //get a new instance of parser SAXParser p = spf.newSAXParser(); FileInputStream in = new FileInputStream(file); sax.setFileName(file.getName()); p.parse(in, sax); //writer1.write("\n"); nbPatentRef += sax.getNbPatentRef(); if (sax.citations != null) { if (sax.citations.size() > previousSrCitations) { previousSrCitations = sax.citations.size(); withSR++; } } journalsPositions = sax.journalsPositions; abbrevJournalsPositions = sax.abbrevJournalsPositions; conferencesPositions = sax.conferencesPositions; publishersPositions = sax.publishersPositions; if (sax.accumulatedText != null) { String text = sax.accumulatedText.toString(); if (text.trim().length() > 0) { // add features for the patent tokens addFeatures(text, writer, journalsPositions, abbrevJournalsPositions, conferencesPositions, publishersPositions); writer.write("\n \n"); } } } catch (Exception e) { throw new GrobidException("An exception occured while running Grobid.", e); } } } }*/ // NPL REF. textual data /*sax = new MarecSaxParser(); sax.patentReferences = false; sax.nplReferences = true; if (type == 0) { // training set sax.setN(trainWindow); } else { // for the test set we enlarge the focus window to include all the document. sax.setN(-1); } // get a factory spf = SAXParserFactory.newInstance(); spf.setValidating(false); spf.setFeature("http://xml.org/sax/features/namespaces", false); spf.setFeature("http://xml.org/sax/features/validation", false); fileList = new LinkedList<File>(); if (setName == null) { fileList.add(new File(corpusPath)); } else if (rank == null) { fileList.add(new File(corpusPath)); } else { fileList.add(new File(corpusPath + File.separator + setName + "ing" + rank + File.separator)); } if ((setName == null) || (setName.length() == 0)) { writer = new OutputStreamWriter(new FileOutputStream( new File(outputPath + "/npl.train"), false), "UTF-8"); } else if (rank == null) { writer = new OutputStreamWriter(new FileOutputStream( new File(outputPath + "/npl." + setName), false), "UTF-8"); } else { writer = new OutputStreamWriter(new FileOutputStream( new File(outputPath + File.separator + setName + "ing" + rank + File.separator + "npl." + setName), false), "UTF-8"); } while (fileList.size() > 0) { File file = fileList.removeFirst(); if (file.isDirectory()) { for (File subFile : file.listFiles()) fileList.addLast(subFile); } else { if (file.getName().endsWith(".xml")) { //nbFiles++; //String text = Files.readFromFile(file,"UTF-8"); try { //get a new instance of parser SAXParser p = spf.newSAXParser(); FileInputStream in = new FileInputStream(file); sax.setFileName(file.toString()); p.parse(in, sax); //writer2.write("\n"); nbNPLRef += sax.getNbNPLRef(); if (sax.nbAllRef > maxRef) { maxRef = sax.nbAllRef; } if (sax.citations != null) { if (sax.citations.size() > previousSrCitations) { previousSrCitations = sax.citations.size(); withSR++; } } journalsPositions = sax.journalsPositions; abbrevJournalsPositions = sax.abbrevJournalsPositions; conferencesPositions = sax.conferencesPositions; publishersPositions = sax.publishersPositions; //totalLength += sax.totalLength; if (sax.accumulatedText != null) { String text = sax.accumulatedText.toString(); // add features for NPL addFeatures(text, writer, journalsPositions, abbrevJournalsPositions, conferencesPositions, publishersPositions); writer.write("\n"); } } catch (Exception e) { throw new GrobidException("An exception occured while running Grobid.", e); } } } } if (sax.citations != null) srCitations += sax.citations.size();*/ // Patent + NPL REF. textual data (the "all" model) sax = new MarecSaxParser(); sax.patentReferences = true; sax.nplReferences = true; if (type == 0) { // training set sax.setN(trainWindow); } else { // for the test set we enlarge the focus window to include all the document. sax.setN(-1); } // get a factory SAXParserFactory spf = SAXParserFactory.newInstance(); spf.setValidating(false); spf.setFeature("http://xml.org/sax/features/namespaces", false); spf.setFeature("http://xml.org/sax/features/validation", false); LinkedList<File> fileList = new LinkedList<File>(); if (setName == null) { fileList.add(new File(corpusPath)); } else if (rank == null) { fileList.add(new File(corpusPath)); } else { fileList.add( new File(corpusPath + File.separator + setName + "ing" + rank + File.separator)); } Writer writer = null; if ((setName == null) || (setName.length() == 0)) { writer = new OutputStreamWriter( new FileOutputStream(new File(outputPath + File.separator + "all.train"), false), "UTF-8"); } else if (rank == null) { writer = new OutputStreamWriter( new FileOutputStream( new File(outputPath + File.separator + "all." + setName), false), "UTF-8"); } else { writer = new OutputStreamWriter( new FileOutputStream( new File( outputPath + File.separator + setName + "ing" + rank + File.separator + "all." + setName), false), "UTF-8"); } // int totalLength = 0; while (fileList.size() > 0) { File file = fileList.removeFirst(); if (file.isDirectory()) { for (File subFile : file.listFiles()) { fileList.addLast(subFile); } } else { if (file.getName().endsWith(".xml")) { nbFiles++; try { // get a new instance of parser SAXParser p = spf.newSAXParser(); FileInputStream in = new FileInputStream(file); sax.setFileName(file.toString()); p.parse(in, sax); // writer3.write("\n"); nbNPLRef += sax.getNbNPLRef(); nbPatentRef += sax.getNbPatentRef(); if (sax.nbAllRef > maxRef) { maxRef = sax.nbAllRef; } if (sax.citations != null) { if (sax.citations.size() > previousSrCitations) { previousSrCitations = sax.citations.size(); withSR++; } } journalsPositions = sax.journalsPositions; abbrevJournalsPositions = sax.abbrevJournalsPositions; conferencesPositions = sax.conferencesPositions; publishersPositions = sax.publishersPositions; // totalLength += sax.totalLength; if (sax.accumulatedText != null) { String text = sax.accumulatedText.toString(); // add features for patent+NPL addFeatures( text, writer, journalsPositions, abbrevJournalsPositions, conferencesPositions, publishersPositions); writer.write("\n"); } } catch (Exception e) { throw new GrobidException("An exception occured while running Grobid.", e); } } } } if (sax.citations != null) { srCitations += sax.citations.size(); } if (setName != null) { System.out.println(setName + "ing on " + nbFiles + " files"); } else { System.out.println("training on " + nbFiles + " files"); } // System.out.println("Number of file with search report: " + withSR); System.out.println("Number of references: " + (nbNPLRef + nbPatentRef)); System.out.println("Number of patent references: " + nbPatentRef); System.out.println("Number of NPL references: " + nbNPLRef); // System.out.println("Number of search report citations: " + srCitations); System.out.println( "Average number of references: " + TextUtilities.formatTwoDecimals((double) (nbNPLRef + nbPatentRef) / nbFiles)); System.out.println("Max number of references in file: " + maxRef); /*if ((setName == null) || (setName.length() == 0)) { System.out.println("patent data set under: " + outputPath + "/patent.train"); } else { System.out.println("patent data set under: " + outputPath + "/patent." + setName); } if ((setName == null) || (setName.length() == 0)) { System.out.println("npl data set under: " + outputPath + "/npl.train"); } else { System.out.println("npl data set under: " + outputPath + "/npl." + setName); }*/ if ((setName == null) || (setName.length() == 0)) { System.out.println("common data set under: " + outputPath + "/all.train"); } else { System.out.println("common data set under: " + outputPath + "/all." + setName); } } catch (Exception e) { throw new GrobidException("An exception occurred while running Grobid.", e); } }