Esempio n. 1
0
  /**
   * Create the training or evaluation data set from the annotated examples, with extraction of
   * citations in the patent description body.
   *
   * <p>Only the combined patent + NPL reference data set (the "all" model) is generated. Every
   * {@code .xml} file found under the corpus location (sub-directories included) is parsed with a
   * {@code MarecSaxParser}; the features computed from the text accumulated by the parser are
   * appended to a single UTF-8 output file. Summary statistics are printed on standard output.
   *
   * @param setName name of the set; it is used as the extension of the output file ({@code
   *     all.<setName>}) and, together with {@code rank}, to select the fold directory {@code
   *     <setName>ing<rank>}. When null or empty the output file is {@code all.train}
   * @param rank rank associated to the set for n-fold data generation; when null no fold
   *     sub-directory is used
   * @param corpusPath path of the file or directory holding the annotated patent XML files
   * @param outputPath path of the directory under which the data set file is written
   * @param type type of data to be created, 0 is training data, any other value (normally 1) is
   *     evaluation data
   * @throws GrobidException if reading the corpus, parsing a file or writing the data set fails
   */
  public void createDataSet(
      String setName, String rank, String corpusPath, String outputPath, int type) {
    int nbFiles = 0;
    int nbNPLRef = 0;
    int nbPatentRef = 0;
    int maxRef = 0;
    try {
      // Patent + NPL REF. textual data (the "all" model)
      // we use a SAX parser on the patent XML files
      MarecSaxParser sax = new MarecSaxParser();
      sax.patentReferences = true;
      sax.nplReferences = true;
      if (type == 0) {
        // training set
        sax.setN(trainWindow);
      } else {
        // for the test set we enlarge the focus window to include all the document.
        sax.setN(-1);
      }

      // get a factory
      SAXParserFactory spf = SAXParserFactory.newInstance();
      spf.setValidating(false);
      spf.setFeature("http://xml.org/sax/features/namespaces", false);
      spf.setFeature("http://xml.org/sax/features/validation", false);

      // starting point of the corpus traversal: the fold directory for n-fold evaluation,
      // the corpus path itself otherwise
      LinkedList<File> fileList = new LinkedList<File>();
      if ((setName == null) || (rank == null)) {
        fileList.add(new File(corpusPath));
      } else {
        fileList.add(
            new File(corpusPath + File.separator + setName + "ing" + rank + File.separator));
      }

      // location of the generated data set
      File outputFile;
      if ((setName == null) || (setName.length() == 0)) {
        outputFile = new File(outputPath + File.separator + "all.train");
      } else if (rank == null) {
        outputFile = new File(outputPath + File.separator + "all." + setName);
      } else {
        outputFile =
            new File(
                outputPath
                    + File.separator
                    + setName
                    + "ing"
                    + rank
                    + File.separator
                    + "all."
                    + setName);
      }

      Writer writer = new OutputStreamWriter(new FileOutputStream(outputFile, false), "UTF-8");
      try {
        // breadth-first traversal of the corpus location
        while (!fileList.isEmpty()) {
          File file = fileList.removeFirst();
          if (file.isDirectory()) {
            File[] children = file.listFiles();
            if (children == null) {
              // listFiles() returns null when the directory cannot be read
              throw new IllegalStateException("Cannot list the content of directory " + file);
            }
            for (File subFile : children) {
              fileList.addLast(subFile);
            }
            continue;
          }
          if (!file.getName().endsWith(".xml")) {
            continue;
          }

          nbFiles++;
          try {
            // get a new instance of parser
            SAXParser p = spf.newSAXParser();
            FileInputStream in = new FileInputStream(file);
            try {
              sax.setFileName(file.toString());
              p.parse(in, sax);
            } finally {
              // release the file handle whether or not the parsing succeeded
              in.close();
            }

            nbNPLRef += sax.getNbNPLRef();
            nbPatentRef += sax.getNbPatentRef();
            if (sax.nbAllRef > maxRef) {
              maxRef = sax.nbAllRef;
            }

            if (sax.accumulatedText != null) {
              String text = sax.accumulatedText.toString();
              // add features for patent+NPL
              addFeatures(
                  text,
                  writer,
                  sax.journalsPositions,
                  sax.abbrevJournalsPositions,
                  sax.conferencesPositions,
                  sax.publishersPositions);
              writer.write("\n");
            }
          } catch (Exception e) {
            throw new GrobidException(
                "An exception occurred while running Grobid on file " + file + ".", e);
          }
        }
      } finally {
        // closing also flushes the buffered content to the data set file
        writer.close();
      }

      if (setName != null) {
        System.out.println(setName + "ing on " + nbFiles + " files");
      } else {
        System.out.println("training on " + nbFiles + " files");
      }
      int nbRef = nbNPLRef + nbPatentRef;
      System.out.println("Number of references: " + nbRef);
      System.out.println("Number of patent references: " + nbPatentRef);
      System.out.println("Number of NPL references: " + nbNPLRef);
      // no file processed means no average to compute: report 0 rather than NaN
      double averageRef = (nbFiles == 0) ? 0.0 : (double) nbRef / nbFiles;
      System.out.println(
          "Average number of references: " + TextUtilities.formatTwoDecimals(averageRef));
      System.out.println("Max number of references in file: " + maxRef);

      // report the file actually written, fold sub-directory included
      System.out.println("common data set under: " + outputFile.getPath());
    } catch (Exception e) {
      throw new GrobidException("An exception occurred while running Grobid.", e);
    }
  }