Exemplo n.º 1
0
  /**
   * Preprocesses an XML dump: streams the file, extracts per-page data and writes one CSV line per
   * {@code <page>} element through {@code fileSave}, while updating {@code map} with redirect and
   * disambiguation bookkeeping.
   *
   * <p>Handled elements (matched case-insensitively on the local name):
   *
   * <ul>
   *   <li>{@code page} - flushes the previously collected page as a CSV row and resets the state;
   *   <li>{@code title} - stored as the current page title;
   *   <li>{@code id} - only the first id seen inside a page is kept;
   *   <li>{@code redirect} - its {@code title} attribute is recorded, and the matching map entry is
   *       updated (or created and counted as a dead redirect when it was missing);
   *   <li>{@code text} - scanned for {@code [[...]]} links; pages containing the
   *       <code>{{rozlišovacia stránka}}</code> template are treated as disambiguation pages.
   * </ul>
   *
   * <p>Any exception raised while parsing is printed and ends the parse early; the counters
   * collected up to that point are still returned, with {@code time} left at 0.
   *
   * @param fileName path of the XML file to read
   * @param map entries keyed by page title; read and modified in place. Presumably pre-populated
   *     with every existing page title by the caller - TODO confirm.
   * @return the aggregated counters and the elapsed time in milliseconds
   */
  public dataSending preprocessing(String fileName, Map<String, ReverseRedirects> map) {
    XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance();
    long i = 0,
        allRedirects = 0,
        allDisamb = 0,
        deadRedirects = 0,
        nameRedirects = 0,
        deadDisamb = 0,
        allDisambRedirects = 0;
    long time = 0;
    Preprocessing current = new Preprocessing();
    ReverseRedirects help;

    long start = System.currentTimeMillis();
    // Matches one wiki link of the form [[...]] (no ']' allowed inside).
    Pattern r = Pattern.compile("(\\[\\[[^\\]\\]]*\\]\\])");

    FileInputStream input = null;
    XMLEventReader xmlEventReader = null;
    try {
      input = new FileInputStream(fileName);
      xmlEventReader = xmlInputFactory.createXMLEventReader(input);

      while (xmlEventReader.hasNext()) {
        XMLEvent xmlEvent = xmlEventReader.nextEvent();
        if (!xmlEvent.isStartElement()) {
          continue;
        }
        StartElement startElement = xmlEvent.asStartElement();
        // equalsIgnoreCase is locale-independent, unlike toLowerCase() with the default locale.
        String tag = startElement.getName().getLocalPart();

        if (tag.equalsIgnoreCase("page")) {
          /* A new page starts: flush the previous one (if any) and reset the state. */
          i++;
          if (i != 1) {
            // The collected data belongs to the previous page, hence index i - 1.
            fileSave.addLine(buildPageRow(i - 1, current, false));
          }

          current.ambig = false;
          current.ambigs = new ArrayList<String>();
          current.id = 0;
          current.redirects = new ArrayList<String>();
          current.redirTitle = null;
          current.title = null;
          current.numberRedirects = 0;
          current.numberAmbigs = 0;
        } else if (tag.equalsIgnoreCase("title")) {
          // getElementText() concatenates all character events, so the title is not truncated
          // when the parser splits the text, and an empty element yields "" instead of failing.
          current.title = xmlEventReader.getElementText();
        } else if (tag.equalsIgnoreCase("id")) {
          // Only the first <id> inside a page is kept; later ones are skipped.
          if (current.id == 0) {
            current.id = Long.parseLong(xmlEventReader.getElementText().trim());
          }
        } else if (tag.equalsIgnoreCase("redirect")) {
          nameRedirects++;
          Iterator<Attribute> attributes = startElement.getAttributes();
          while (attributes.hasNext()) {
            Attribute attribute = attributes.next();
            if (!attribute.getName().toString().equals("title")) {
              continue;
            }
            current.redirTitle = attribute.getValue();
            help = map.get(current.redirTitle);
            if (help == null) {
              // The redirect target is unknown: register it and count a dead redirect.
              help = new ReverseRedirects(false);
              help.count++;
              help.reversers = current.title;
              map.put(current.redirTitle, help);
              deadRedirects++;
            } else {
              help.count++;
              help.reversers = joinEntry(help.reversers, current.title);
            }
          }
        } else if (tag.equalsIgnoreCase("text")) {
          String line = xmlEventReader.getElementText();
          Matcher m = r.matcher(line);
          if (line.toLowerCase(java.util.Locale.ROOT).contains("{{rozlišovacia stránka}}")) {
            /* Disambiguation page: every link is a disambiguation target. */
            current.ambig = true;

            // May be null when the page title is not in the map; the global counters are still
            // updated in that case, only the per-entry bookkeeping is skipped.
            help = map.get(current.title);
            if (help != null) {
              help.isDis = true;
            }
            while (m.find()) {
              current.numberRedirects++;
              allRedirects++;
              allDisambRedirects++;
              current.redirects.add(m.group(1));

              String target = extractLinkTarget(m.group(1));
              // The map is keyed by String, so the lookup must use a String key.
              boolean dead = !map.containsKey(target);
              if (dead) {
                deadDisamb++;
              }
              if (help != null) {
                help.countDisam++;
                if (dead) {
                  help.deadDisambigs = joinEntry(help.deadDisambigs, target);
                }
                help.disambgis = joinEntry(help.disambgis, target);
              }
            }
            allDisamb++;
          } else {
            /* Regular page: just collect the links (commas replaced to keep the CSV intact). */
            while (m.find()) {
              allRedirects++;
              current.numberRedirects++;
              current.redirects.add(m.group(1).replace(',', ' '));
            }
          }
        }
      }

      // Flush the last page; nothing is written when the file contained no page at all.
      // NOTE(review): this final row lists the collected links, whereas rows flushed inside the
      // loop end with ",no" - kept as in the original, confirm which form is intended.
      if (i > 0) {
        fileSave.addLine(buildPageRow(i, current, true));
      }

      time = System.currentTimeMillis() - start;

    } catch (Exception e) {
      System.out.println("error: " + e);
      e.printStackTrace();
    } finally {
      // Release the reader and the underlying stream even when parsing failed.
      if (xmlEventReader != null) {
        try {
          xmlEventReader.close();
        } catch (Exception ignored) {
          // Nothing useful can be done if closing fails; the result is already computed.
        }
      }
      if (input != null) {
        try {
          input.close();
        } catch (Exception ignored) {
          // Same as above: a failed close must not mask the parse outcome.
        }
      }
    }
    fileSave.close();
    dataSending result = new dataSending();
    result.allDisamb = allDisamb;
    result.allDisambRedi = allDisambRedirects;
    result.allRedirects = allRedirects;
    result.deadDisambRedir = deadDisamb;
    result.time = time;
    result.nameRedirects = nameRedirects;
    result.deadRedirects = deadRedirects;
    return result;
  }

  /**
   * Formats one CSV row: index, title, redirect title, Y/N disambiguation flag, link count, and
   * then either every collected link ({@code listLinks == true}) or the literal {@code no}.
   * Missing title / redirect title are written as a single space.
   */
  private String buildPageRow(long index, Preprocessing page, boolean listLinks) {
    StringBuilder row = new StringBuilder();
    row.append(index);
    row.append(',').append(page.title != null ? page.title : " ");
    row.append(',').append(page.redirTitle != null ? page.redirTitle : " ");
    row.append(page.ambig ? ",Y" : ",N");
    row.append(',').append(page.numberRedirects);
    if (listLinks) {
      for (int j = 0; j < page.numberRedirects; j++) {
        row.append(',').append(page.redirects.get(j));
      }
    } else {
      row.append(",no");
    }
    return row.toString();
  }

  /**
   * Returns the target of a {@code [[target]]} or {@code [[target|label]]} link, with commas
   * replaced by spaces. {@code link} must start with {@code [[} and end with {@code ]]}.
   */
  private String extractLinkTarget(String link) {
    int pipe = link.indexOf('|');
    int end = pipe >= 0 ? pipe : link.length() - 2;
    return link.substring(2, end).replace(',', ' ');
  }

  /** Appends {@code value} to a {@code ", "}-separated list, starting the list when it is null. */
  private String joinEntry(String existing, String value) {
    return existing == null ? value : existing + ", " + value;
  }