Beispiel #1
0
  private void escribeArchivo(List<String> lista, String ruta) {
    try {

      BufferedWriter escritor = new BufferedWriter(new FileWriter(ruta));

      for (String s : lista) {
        escritor.write(s);
      }

      escritor.close();

    } catch (IOException ex) {
      Logger.getLogger(Reportador.class.getName()).log(Level.SEVERE, null, ex);
    }
  }
  public static void main(String[] args) {
    try {
      DocumentBuilderFactory db = DocumentBuilderFactory.newInstance();
      DocumentBuilder builder = db.newDocumentBuilder();
      Document dom = builder.parse("data/geographic-area-data.html");
      XPathFactory xpath = XPathFactory.newInstance();
      XPath path = xpath.newXPath();
      XPathExpression table =
          path.compile("//div[@id='mw-content-text']/table[contains(@class,'wikitable')]/tr");
      NodeList wikiData = (NodeList) table.evaluate(dom, XPathConstants.NODESET);

      NodeList children;
      String currentData, cleanData;

      /* Open output stream */
      FileWriter fstream = new FileWriter("data/parsed.yaml");
      BufferedWriter out = new BufferedWriter(fstream);

      for (int i = 0; i < wikiData.getLength(); i++) {
        if (i == 0) {
          continue;
        }
        out.write(new Integer(i).toString() + ":\n");
        children = wikiData.item(i).getChildNodes();
        for (int j = 0; j < children.getLength(); j++) {
          currentData = (String) children.item(j).getTextContent();
          switch (j) {
            case 0:
              /* Current Data is empty */
              break;
            case 1:
              cleanData = decompose(currentData).trim().replaceAll("[^a-zA-Z\\s]+", "");
              out.write("\t\"Geographic entity\": \"" + cleanData + "\"\n");
              break;
            case 2:
              /* Current Data is empty */
              break;
            case 3:
              cleanData = decompose(currentData).trim().replaceAll(",", "");
              out.write("\t\"Area\": \"" + cleanData + "\"\n");
              break;
            case 4:
              /* Current Data is empty */
              break;
            case 5:
              cleanData = decompose(currentData).trim();
              out.write("\t\"Notes\": \"" + cleanData + "\"\n");
              break;
            case 6:
              /* Current Data is empty */
              break;
            default:
              /* System.out.println("[" + j + "] Hit default case statement. Current Data is: " + currentData); */
              break;
          }
        }
      }
      /* Close output stream */
      out.close();
    } catch (Exception e) {
      System.out.println(e);
    }
  }
Beispiel #3
0
  private void processTxt(Node operation) {
    List<Node> targets = getChildNodes(operation, "target");
    List<Node> optionNodes = getChildNodes(operation, "opt");
    List<Node> separatorNode = getChildNodes(operation, "separator");
    if (targets.isEmpty() || optionNodes.isEmpty()) {
      return;
    }
    String defaultSeparator = "=";
    String globalSeparator = defaultSeparator;
    if (!separatorNode.isEmpty()) {
      globalSeparator = separatorNode.get(0).getTextContent();
      if (globalSeparator.length() != 1) {
        globalSeparator = defaultSeparator;
      }
    }
    Map<String, String> options = new HashMap<String, String>();
    Map<String, String> processedOptions = new HashMap<String, String>();
    for (int i = 0; i < optionNodes.size(); i++) {
      Node option = optionNodes.get(i);
      String name = option.getAttributes().getNamedItem("name").getNodeValue();
      String value = option.getTextContent();
      if (options.containsKey(name)) {
        options.remove(name);
      }
      options.put(name, value);
    }
    for (int t = 0; t < targets.size(); t++) {
      File target = new File(absolutePath(targets.get(t).getTextContent()));
      File tmpFile = new File(Utils.timestamp());
      BufferedWriter bw = null;
      BufferedReader br = null;
      try {
        Node separatorAttr = targets.get(t).getAttributes().getNamedItem("separator");
        String separator = (separatorAttr == null) ? globalSeparator : separatorAttr.getNodeValue();
        if (separator.length() != 1) {
          separator = globalSeparator;
        }
        bw = new BufferedWriter(new FileWriter(tmpFile));
        if (target.exists()) {
          br = new BufferedReader(new FileReader(target));
          for (String line; (line = br.readLine()) != null; ) {
            String[] parts = line.split(separator);
            if (parts.length < 2) {
              bw.write(line);
              bw.newLine();
              continue;
            }

            String optName = parts[0].trim();
            if (options.containsKey(optName)) {
              String optValue = options.get(optName);
              bw.write(optName + " " + separator + " " + optValue);
              bw.newLine();
              processedOptions.put(optName, optValue);
              options.remove(optName);
            } else if (processedOptions.containsKey(optName)) {
              bw.write(optName + " " + separator + " " + processedOptions.get(optName));
              bw.newLine();
            } else {
              bw.write(line);
              bw.newLine();
            }
          }
          br.close();
        }
        for (Map.Entry<String, String> entry : options.entrySet()) {
          bw.write(entry.getKey() + " " + separator + " " + entry.getValue());
          bw.newLine();
        }
        bw.close();
        FileUtils.copyFile(tmpFile, target);
        FileUtils.forceDelete(tmpFile);
      } catch (IOException ex) {
        Utils.onError(new Error.WriteTxtConfig(target.getPath()));
      }
    }
  }
Beispiel #4
0
  // -----------------------------------------------------
  // Description: Using the specified template, create
  // csv records corresponding to the output xml criteria.
  // -----------------------------------------------------
  void transformWithTemplate(String templateName, String fileName, String destinationDirectory) {

    System.out.println("fileName = " + fileName);
    String fileName_woext = fileName.substring(0, fileName.lastIndexOf('.'));

    try {
      File fXmlFile = new File(templateLocation, templateName);
      DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
      DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
      Document doc = dBuilder.parse(fXmlFile);
      doc.getDocumentElement().normalize();

      System.out.println("Root element :" + doc.getDocumentElement().getNodeName());
      NodeList nList = doc.getElementsByTagName("output");

      // loop through all the output nodes
      for (int temp = 0; temp < nList.getLength(); temp++) {

        Node nNode = nList.item(temp);
        if (nNode.getNodeType() == Node.ELEMENT_NODE) {

          Element eElement = (Element) nNode;
          String outputLine = getTagValue("line", eElement);

          FileWriter output =
              new FileWriter(
                  destinationDirectory
                      + File.separator
                      + fileName_woext.substring(0, fileName.lastIndexOf('_'))
                      + ".csv",
                  true);
          BufferedWriter bufWrite = new BufferedWriter(output);

          System.out.println(
              destinationDirectory
                  + File.separator
                  + fileName_woext.substring(0, fileName.lastIndexOf("_"))
                  + ".csv");

          if (Settings.optionalIdentifier != null) {
            bufWrite.write("\"" + Settings.optionalIdentifier + "\"");
          }

          String[] outputItems = outputLine.split("#");
          for (String outputItem : outputItems) {
            if ((temp == 0) && (Settings.optionalIdentifier == null)) {
              bufWrite.write(
                  "\"" + interpretText(destinationDirectory, fileName_woext, outputItem) + "\"");
            } else {
              bufWrite.write(
                  ",\"" + interpretText(destinationDirectory, fileName_woext, outputItem) + "\"");
            }
          }

          bufWrite.write("\r\n");
          bufWrite.close();
          output.close();
        }
      }

    } catch (Exception e) {
      e.printStackTrace();
    }
  }
  public void CrawlRT(String RTPage) throws IOException {
    ArrayList<String> t = new ArrayList<String>();
    String crawlData;
    String crawlData2;
    String crawlData3;

    FileReader freader = new FileReader("Crawl.txt");
    BufferedReader br = new BufferedReader(freader);
    FileReader freader2 = new FileReader("Tocrawl.txt");
    BufferedReader br2 = new BufferedReader(freader2);
    FileWriter fwriter2 = new FileWriter("Tocrawl.txt", true);
    BufferedWriter bw2 = new BufferedWriter(fwriter2);
    FileWriter fwriter = new FileWriter("Crawl.txt", true);
    BufferedWriter bw = new BufferedWriter(fwriter);

    /*while(null != (crawlData2 = br.readLine()))
    {
    	if(crawlData2 !=null)
    		Crawl.add(crawlData2);
    }
    t = collectLinks(RTPage);
    Iterator<String> e3= t.iterator();
    while(e3.hasNext())
    {
    	String ee = e3.next();

    		if(!Crawl.contains(ee))
    		{
    			bw2.write(ee+"\r\n");
    		}



    }
    br.close();
    br2.close();
    bw.close();
    bw2.close();*/

    if (null == (crawlData = br.readLine()))
    // if(true)
    {
      // initial iteration
      bw.write(RTPage + "\r\n");
      Crawl.add(RTPage);
      t = collectLinks(RTPage);
      ToCrawl.addAll(t);
    } else {
      // collect data from files and load to array lists
      while (null != (crawlData2 = br.readLine())) {
        if (crawlData2 != null) Crawl.add(crawlData2);
      }

      while (null != (crawlData3 = br2.readLine())) {
        if (crawlData3 != null) ToCrawl.add(crawlData3);
      }
    }
    System.out.println("Crawlled");

    // Number of movies to be crawled
    for (int i = 0; i < 1000; i++) {
      if (ToCrawl.size() > 0) {
        Crawl.removeAll(Collections.singleton(null));
        ToCrawl.removeAll(Collections.singleton(null));
        String c = ToCrawl.get(0);
        if (Crawl.contains(c)) ToCrawl.remove(c);
        else {
          // collect links and collect data from a particular link
          Crawl.add(c);
          t = collectLinks(c);
          CollectData(c);
          ToCrawl.remove(c);
          Iterator<String> e3 = t.iterator();
          while (e3.hasNext()) {
            String ee = e3.next();
            if (!ToCrawl.contains(ee)) {
              if (!Crawl.contains(ee)) {
                ToCrawl.add(ee);
              }
            }
          }
          bw.write(c + "\r\n");
        }
      }
    }

    System.out.println("To Be Crawlled");
    Iterator<String> e2 = ToCrawl.iterator();
    while (e2.hasNext()) {
      // write to file the movies still to be crawled.
      bw2.write(e2.next() + "\r\n");
    }

    prop.setProperty("Id", Integer.toString(n));
    prop.store(new FileOutputStream("config.properties"), null);
    br.close();
    br2.close();
    bw.close();
    bw2.close();
  }
  public void CollectData(String link) {

    try {
      // Creating an empty XML Document

      DocumentBuilderFactory dbfac = DocumentBuilderFactory.newInstance();
      DocumentBuilder docBuilder = dbfac.newDocumentBuilder();
      Document doc = docBuilder.newDocument();
      int flag = 0;
      // create the root element and add it to the document
      Element movie = doc.createElement("movie");
      doc.appendChild(movie);
      movie.setAttribute("id", String.valueOf(n));
      n++;
      // create sub elements
      Element genres = doc.createElement("genres");
      Element actors = doc.createElement("actors");
      Element reviews = doc.createElement("reviews");

      URL movieUrl = new URL(link);
      URL reviewsURL = new URL(link + "reviews/#type=top_critics");
      BufferedWriter bw3 = new BufferedWriter(new FileWriter("movies.xml", true));
      int count = -1;
      String auth = "";
      BufferedReader br3 = new BufferedReader(new InputStreamReader(movieUrl.openStream()));
      String str2 = "";
      String info = "";
      while (null != (str2 = br3.readLine())) {
        // start reading the html document
        if (str2.isEmpty()) continue;
        if (count == 14) break;
        if (count == 12) {
          if (!str2.contains("<h3>Cast</h3>")) continue;
          else count++;
        }
        if (count == 13) {
          if (str2.contains(">ADVERTISEMENT</p>")) {
            count++;
            movie.appendChild(actors);
            continue;
          } else {
            if (str2.contains("itemprop=\"name\">")) {
              Element actor = doc.createElement("actor");
              actors.appendChild(actor);
              Text text = doc.createTextNode(Jsoup.parse(str2.toString()).text());
              actor.appendChild(text);
            } else continue;
          }
        }

        if (count <= 11) {
          switch (count) {
            case -1:
              {
                if (!str2.contains("property=\"og:image\"")) continue;
                else {
                  Pattern image =
                      Pattern.compile("http://.*.jpg", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
                  Matcher match = image.matcher(str2);
                  while (match.find()) {

                    Element imageLink = doc.createElement("imageLink");
                    movie.appendChild(imageLink);
                    Text text = doc.createTextNode(match.group());
                    imageLink.appendChild(text);
                    count++;
                  }
                }
                break;
              }
            case 0:
              {
                if (str2.contains("<title>")) {

                  Element name = doc.createElement("name");
                  movie.appendChild(name);
                  Text text =
                      doc.createTextNode(
                          Jsoup.parse(str2.toString().replace(" - Rotten Tomatoes", "")).text());
                  name.appendChild(text);
                  count++;
                }
                break;
              }
            case 1:
              {
                if (!str2.contains("itemprop=\"ratingValue\"")) break;
                else {
                  Element score = doc.createElement("score");
                  movie.appendChild(score);
                  Text text = doc.createTextNode(Jsoup.parse(str2.toString()).text());
                  score.appendChild(text);
                  count++;
                }
                break;
              }
            case 2:
              {
                if (!str2.contains("itemprop=\"description\">")) continue;
                else count++;
                break;
              }
            case 3:
              {
                if (!str2.contains("itemprop=\"duration\"")) info = info.concat(str2);
                else {
                  Element MovieInfo = doc.createElement("MovieInfo");
                  movie.appendChild(MovieInfo);
                  Text text = doc.createTextNode(Jsoup.parse(info.toString()).text());
                  MovieInfo.appendChild(text);
                  info = str2;
                  count++;
                }
                break;
              }
            case 4:
              {
                if (!str2.contains("itemprop=\"genre\"")) info = info.concat(str2);
                else {
                  Element duration = doc.createElement("duration");
                  movie.appendChild(duration);
                  Text text = doc.createTextNode(Jsoup.parse(info.toString()).text());
                  duration.appendChild(text);
                  info = str2;
                  count++;
                }
                break;
              }
            case 5:
              {
                if (info.contains("itemprop=\"genre\"")) {
                  Element genre = doc.createElement("genre");
                  genres.appendChild(genre);
                  Text text = doc.createTextNode(Jsoup.parse(info.toString()).text());
                  genre.appendChild(text);
                  info = "";
                }
                if (str2.contains(">Directed By:<")) {
                  count++;
                  movie.appendChild(genres);
                  continue;
                } else {

                  if (str2.contains("itemprop=\"genre\"")) {
                    Element genre = doc.createElement("genre");
                    genres.appendChild(genre);
                    Text text = doc.createTextNode(Jsoup.parse(str2.toString()).text());
                    genre.appendChild(text);
                  } else continue;
                }
                break;
              }
            case 6:
              {
                if (!str2.contains(">Written By:<")) {
                  if (str2.contains(">In Theaters:<")) {
                    Element director = doc.createElement("director");
                    movie.appendChild(director);
                    Text text =
                        doc.createTextNode(
                            Jsoup.parse(info.toString().replace("Directed By: ", "")).text());
                    director.appendChild(text);
                    info = str2;
                    count += 2;
                    break;
                  }
                  info = info.concat(str2);
                } else {
                  Element director = doc.createElement("director");
                  movie.appendChild(director);
                  Text text =
                      doc.createTextNode(
                          Jsoup.parse(info.toString().replace("Directed By: ", "")).text());
                  director.appendChild(text);
                  info = "";
                  count++;
                }
                break;
              }
            case 7:
              {
                if (!str2.contains(">In Theaters:<")) {
                  if (str2.contains(">On DVD:<")) {
                    Element writer = doc.createElement("writer");
                    movie.appendChild(writer);
                    Text text = doc.createTextNode(Jsoup.parse(info.toString()).text());
                    writer.appendChild(text);
                    info = str2;
                    count += 2;
                    break;
                  }
                  info = info.concat(str2);
                } else {
                  Element writer = doc.createElement("writer");
                  movie.appendChild(writer);
                  Text text = doc.createTextNode(Jsoup.parse(info.toString()).text());
                  writer.appendChild(text);
                  info = str2;
                  count++;
                }
                break;
              }
            case 8:
              {
                if (!str2.contains(">On DVD:<")) info = info.concat(str2);
                else {
                  Element TheatreRelease = doc.createElement("TheatreRelease");
                  movie.appendChild(TheatreRelease);
                  Text text =
                      doc.createTextNode(
                          Jsoup.parse(info.toString().replace("In Theaters:", "")).text());
                  TheatreRelease.appendChild(text);
                  info = str2;
                  count++;
                }
                break;
              }
            case 9:
              {
                if (!str2.contains(">US Box Office:<")) {
                  if (str2.contains("itemprop=\"productionCompany\"")) {
                    Element DvdRelease = doc.createElement("DvdRelease");
                    movie.appendChild(DvdRelease);
                    Text text =
                        doc.createTextNode(
                            Jsoup.parse(info.toString().replace("On DVD:", "")).text());
                    DvdRelease.appendChild(text);
                    info = str2;
                    count += 2;
                    break;
                  }
                  info = info.concat(str2);
                } else {
                  Element DvdRelease = doc.createElement("DvdRelease");
                  movie.appendChild(DvdRelease);
                  Text text =
                      doc.createTextNode(
                          Jsoup.parse(info.toString().replace("On DVD:", "")).text());
                  DvdRelease.appendChild(text);
                  info = str2;
                  count++;
                }
                break;
              }
            case 10:
              {
                if (!str2.contains("itemprop=\"productionCompany\"")) info = info.concat(str2);
                else {
                  Element BOCollection = doc.createElement("BOCollection");
                  movie.appendChild(BOCollection);
                  Text text =
                      doc.createTextNode(
                          Jsoup.parse(info.toString().replace("US Box Office:", "")).text());
                  BOCollection.appendChild(text);
                  info = str2;
                  count++;
                }
                break;
              }
            case 11:
              {
                if (!str2.contains(">Official Site")) info = info.concat(str2);
                else {
                  Element Production = doc.createElement("Production");
                  movie.appendChild(Production);
                  Text text = doc.createTextNode(Jsoup.parse(info.toString()).text());
                  Production.appendChild(text);
                  info = str2;
                  count++;
                }
                break;
              }

            default:
              break;
          }
        }
      }
      BufferedReader br4 = new BufferedReader(new InputStreamReader(reviewsURL.openStream()));
      String str3 = "";
      String info2 = "";
      int count2 = 0;
      while (null != (str3 = br4.readLine())) {
        if (count2 == 0) {

          if (!str3.contains("<div class=\"reviewsnippet\">")) continue;
          else count2++;
        }
        if (count2 == 1) {
          if (!str3.contains("<p class=\"small subtle\">")) info2 = info2.concat(str3);
          else {
            Element review = doc.createElement("review");
            reviews.appendChild(review);
            Text text = doc.createTextNode(Jsoup.parse(info2.toString()).text());
            review.appendChild(text);
            info2 = "";
            count2 = 0;
          }
        }
      }
      movie.appendChild(reviews);
      TransformerFactory transfac = TransformerFactory.newInstance();
      Transformer trans = transfac.newTransformer();
      trans.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
      trans.setOutputProperty(OutputKeys.INDENT, "yes");

      // create string from xml tree
      StringWriter sw = new StringWriter();
      StreamResult result = new StreamResult(sw);
      DOMSource source = new DOMSource(doc);
      trans.transform(source, result);
      String xmlString = sw.toString();
      bw3.write(xmlString);
      br3.close();
      br4.close();
      bw3.close();
    } catch (Exception ex) {
      ex.printStackTrace();
    }
  }