private void escribeArchivo(List<String> lista, String ruta) { try { BufferedWriter escritor = new BufferedWriter(new FileWriter(ruta)); for (String s : lista) { escritor.write(s); } escritor.close(); } catch (IOException ex) { Logger.getLogger(Reportador.class.getName()).log(Level.SEVERE, null, ex); } }
public static void main(String[] args) { try { DocumentBuilderFactory db = DocumentBuilderFactory.newInstance(); DocumentBuilder builder = db.newDocumentBuilder(); Document dom = builder.parse("data/geographic-area-data.html"); XPathFactory xpath = XPathFactory.newInstance(); XPath path = xpath.newXPath(); XPathExpression table = path.compile("//div[@id='mw-content-text']/table[contains(@class,'wikitable')]/tr"); NodeList wikiData = (NodeList) table.evaluate(dom, XPathConstants.NODESET); NodeList children; String currentData, cleanData; /* Open output stream */ FileWriter fstream = new FileWriter("data/parsed.yaml"); BufferedWriter out = new BufferedWriter(fstream); for (int i = 0; i < wikiData.getLength(); i++) { if (i == 0) { continue; } out.write(new Integer(i).toString() + ":\n"); children = wikiData.item(i).getChildNodes(); for (int j = 0; j < children.getLength(); j++) { currentData = (String) children.item(j).getTextContent(); switch (j) { case 0: /* Current Data is empty */ break; case 1: cleanData = decompose(currentData).trim().replaceAll("[^a-zA-Z\\s]+", ""); out.write("\t\"Geographic entity\": \"" + cleanData + "\"\n"); break; case 2: /* Current Data is empty */ break; case 3: cleanData = decompose(currentData).trim().replaceAll(",", ""); out.write("\t\"Area\": \"" + cleanData + "\"\n"); break; case 4: /* Current Data is empty */ break; case 5: cleanData = decompose(currentData).trim(); out.write("\t\"Notes\": \"" + cleanData + "\"\n"); break; case 6: /* Current Data is empty */ break; default: /* System.out.println("[" + j + "] Hit default case statement. Current Data is: " + currentData); */ break; } } } /* Close output stream */ out.close(); } catch (Exception e) { System.out.println(e); } }
private void processTxt(Node operation) { List<Node> targets = getChildNodes(operation, "target"); List<Node> optionNodes = getChildNodes(operation, "opt"); List<Node> separatorNode = getChildNodes(operation, "separator"); if (targets.isEmpty() || optionNodes.isEmpty()) { return; } String defaultSeparator = "="; String globalSeparator = defaultSeparator; if (!separatorNode.isEmpty()) { globalSeparator = separatorNode.get(0).getTextContent(); if (globalSeparator.length() != 1) { globalSeparator = defaultSeparator; } } Map<String, String> options = new HashMap<String, String>(); Map<String, String> processedOptions = new HashMap<String, String>(); for (int i = 0; i < optionNodes.size(); i++) { Node option = optionNodes.get(i); String name = option.getAttributes().getNamedItem("name").getNodeValue(); String value = option.getTextContent(); if (options.containsKey(name)) { options.remove(name); } options.put(name, value); } for (int t = 0; t < targets.size(); t++) { File target = new File(absolutePath(targets.get(t).getTextContent())); File tmpFile = new File(Utils.timestamp()); BufferedWriter bw = null; BufferedReader br = null; try { Node separatorAttr = targets.get(t).getAttributes().getNamedItem("separator"); String separator = (separatorAttr == null) ? globalSeparator : separatorAttr.getNodeValue(); if (separator.length() != 1) { separator = globalSeparator; } bw = new BufferedWriter(new FileWriter(tmpFile)); if (target.exists()) { br = new BufferedReader(new FileReader(target)); for (String line; (line = br.readLine()) != null; ) { String[] parts = line.split(separator); if (parts.length < 2) { bw.write(line); bw.newLine(); continue; } String optName = parts[0].trim(); if (options.containsKey(optName)) { String optValue = options.get(optName); bw.write(optName + " " + separator + " " + optValue); bw.newLine(); processedOptions.put(optName, optValue); options.remove(optName); } else if (processedOptions.containsKey(optName)) { bw.write(optName + " " + separator + " " + processedOptions.get(optName)); bw.newLine(); } else { bw.write(line); bw.newLine(); } } br.close(); } for (Map.Entry<String, String> entry : options.entrySet()) { bw.write(entry.getKey() + " " + separator + " " + entry.getValue()); bw.newLine(); } bw.close(); FileUtils.copyFile(tmpFile, target); FileUtils.forceDelete(tmpFile); } catch (IOException ex) { Utils.onError(new Error.WriteTxtConfig(target.getPath())); } } }
// ----------------------------------------------------- // Description: Using the specified template, create // csv records corresponding to the output xml criteria. // ----------------------------------------------------- void transformWithTemplate(String templateName, String fileName, String destinationDirectory) { System.out.println("fileName = " + fileName); String fileName_woext = fileName.substring(0, fileName.lastIndexOf('.')); try { File fXmlFile = new File(templateLocation, templateName); DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance(); DocumentBuilder dBuilder = dbFactory.newDocumentBuilder(); Document doc = dBuilder.parse(fXmlFile); doc.getDocumentElement().normalize(); System.out.println("Root element :" + doc.getDocumentElement().getNodeName()); NodeList nList = doc.getElementsByTagName("output"); // loop through all the output nodes for (int temp = 0; temp < nList.getLength(); temp++) { Node nNode = nList.item(temp); if (nNode.getNodeType() == Node.ELEMENT_NODE) { Element eElement = (Element) nNode; String outputLine = getTagValue("line", eElement); FileWriter output = new FileWriter( destinationDirectory + File.separator + fileName_woext.substring(0, fileName.lastIndexOf('_')) + ".csv", true); BufferedWriter bufWrite = new BufferedWriter(output); System.out.println( destinationDirectory + File.separator + fileName_woext.substring(0, fileName.lastIndexOf("_")) + ".csv"); if (Settings.optionalIdentifier != null) { bufWrite.write("\"" + Settings.optionalIdentifier + "\""); } String[] outputItems = outputLine.split("#"); for (String outputItem : outputItems) { if ((temp == 0) && (Settings.optionalIdentifier == null)) { bufWrite.write( "\"" + interpretText(destinationDirectory, fileName_woext, outputItem) + "\""); } else { bufWrite.write( ",\"" + interpretText(destinationDirectory, fileName_woext, outputItem) + "\""); } } bufWrite.write("\r\n"); bufWrite.close(); output.close(); } } } catch (Exception e) { e.printStackTrace(); } }
public void CrawlRT(String RTPage) throws IOException { ArrayList<String> t = new ArrayList<String>(); String crawlData; String crawlData2; String crawlData3; FileReader freader = new FileReader("Crawl.txt"); BufferedReader br = new BufferedReader(freader); FileReader freader2 = new FileReader("Tocrawl.txt"); BufferedReader br2 = new BufferedReader(freader2); FileWriter fwriter2 = new FileWriter("Tocrawl.txt", true); BufferedWriter bw2 = new BufferedWriter(fwriter2); FileWriter fwriter = new FileWriter("Crawl.txt", true); BufferedWriter bw = new BufferedWriter(fwriter); /*while(null != (crawlData2 = br.readLine())) { if(crawlData2 !=null) Crawl.add(crawlData2); } t = collectLinks(RTPage); Iterator<String> e3= t.iterator(); while(e3.hasNext()) { String ee = e3.next(); if(!Crawl.contains(ee)) { bw2.write(ee+"\r\n"); } } br.close(); br2.close(); bw.close(); bw2.close();*/ if (null == (crawlData = br.readLine())) // if(true) { // initial iteration bw.write(RTPage + "\r\n"); Crawl.add(RTPage); t = collectLinks(RTPage); ToCrawl.addAll(t); } else { // collect data from files and load to array lists while (null != (crawlData2 = br.readLine())) { if (crawlData2 != null) Crawl.add(crawlData2); } while (null != (crawlData3 = br2.readLine())) { if (crawlData3 != null) ToCrawl.add(crawlData3); } } System.out.println("Crawlled"); // Number of movies to be crawled for (int i = 0; i < 1000; i++) { if (ToCrawl.size() > 0) { Crawl.removeAll(Collections.singleton(null)); ToCrawl.removeAll(Collections.singleton(null)); String c = ToCrawl.get(0); if (Crawl.contains(c)) ToCrawl.remove(c); else { // collect links and collect data from a particular link Crawl.add(c); t = collectLinks(c); CollectData(c); ToCrawl.remove(c); Iterator<String> e3 = t.iterator(); while (e3.hasNext()) { String ee = e3.next(); if (!ToCrawl.contains(ee)) { if (!Crawl.contains(ee)) { ToCrawl.add(ee); } } } bw.write(c + "\r\n"); } } } System.out.println("To Be Crawlled"); Iterator<String> e2 = ToCrawl.iterator(); while (e2.hasNext()) { // write to file the movies still to be crawled. bw2.write(e2.next() + "\r\n"); } prop.setProperty("Id", Integer.toString(n)); prop.store(new FileOutputStream("config.properties"), null); br.close(); br2.close(); bw.close(); bw2.close(); }
public void CollectData(String link) { try { // Creating an empty XML Document DocumentBuilderFactory dbfac = DocumentBuilderFactory.newInstance(); DocumentBuilder docBuilder = dbfac.newDocumentBuilder(); Document doc = docBuilder.newDocument(); int flag = 0; // create the root element and add it to the document Element movie = doc.createElement("movie"); doc.appendChild(movie); movie.setAttribute("id", String.valueOf(n)); n++; // create sub elements Element genres = doc.createElement("genres"); Element actors = doc.createElement("actors"); Element reviews = doc.createElement("reviews"); URL movieUrl = new URL(link); URL reviewsURL = new URL(link + "reviews/#type=top_critics"); BufferedWriter bw3 = new BufferedWriter(new FileWriter("movies.xml", true)); int count = -1; String auth = ""; BufferedReader br3 = new BufferedReader(new InputStreamReader(movieUrl.openStream())); String str2 = ""; String info = ""; while (null != (str2 = br3.readLine())) { // start reading the html document if (str2.isEmpty()) continue; if (count == 14) break; if (count == 12) { if (!str2.contains("<h3>Cast</h3>")) continue; else count++; } if (count == 13) { if (str2.contains(">ADVERTISEMENT</p>")) { count++; movie.appendChild(actors); continue; } else { if (str2.contains("itemprop=\"name\">")) { Element actor = doc.createElement("actor"); actors.appendChild(actor); Text text = doc.createTextNode(Jsoup.parse(str2.toString()).text()); actor.appendChild(text); } else continue; } } if (count <= 11) { switch (count) { case -1: { if (!str2.contains("property=\"og:image\"")) continue; else { Pattern image = Pattern.compile("http://.*.jpg", Pattern.CASE_INSENSITIVE | Pattern.DOTALL); Matcher match = image.matcher(str2); while (match.find()) { Element imageLink = doc.createElement("imageLink"); movie.appendChild(imageLink); Text text = doc.createTextNode(match.group()); imageLink.appendChild(text); count++; } } break; } case 0: { if (str2.contains("<title>")) { Element name = doc.createElement("name"); movie.appendChild(name); Text text = doc.createTextNode( Jsoup.parse(str2.toString().replace(" - Rotten Tomatoes", "")).text()); name.appendChild(text); count++; } break; } case 1: { if (!str2.contains("itemprop=\"ratingValue\"")) break; else { Element score = doc.createElement("score"); movie.appendChild(score); Text text = doc.createTextNode(Jsoup.parse(str2.toString()).text()); score.appendChild(text); count++; } break; } case 2: { if (!str2.contains("itemprop=\"description\">")) continue; else count++; break; } case 3: { if (!str2.contains("itemprop=\"duration\"")) info = info.concat(str2); else { Element MovieInfo = doc.createElement("MovieInfo"); movie.appendChild(MovieInfo); Text text = doc.createTextNode(Jsoup.parse(info.toString()).text()); MovieInfo.appendChild(text); info = str2; count++; } break; } case 4: { if (!str2.contains("itemprop=\"genre\"")) info = info.concat(str2); else { Element duration = doc.createElement("duration"); movie.appendChild(duration); Text text = doc.createTextNode(Jsoup.parse(info.toString()).text()); duration.appendChild(text); info = str2; count++; } break; } case 5: { if (info.contains("itemprop=\"genre\"")) { Element genre = doc.createElement("genre"); genres.appendChild(genre); Text text = doc.createTextNode(Jsoup.parse(info.toString()).text()); genre.appendChild(text); info = ""; } if (str2.contains(">Directed By:<")) { count++; movie.appendChild(genres); continue; } else { if (str2.contains("itemprop=\"genre\"")) { Element genre = doc.createElement("genre"); genres.appendChild(genre); Text text = doc.createTextNode(Jsoup.parse(str2.toString()).text()); genre.appendChild(text); } else continue; } break; } case 6: { if (!str2.contains(">Written By:<")) { if (str2.contains(">In Theaters:<")) { Element director = doc.createElement("director"); movie.appendChild(director); Text text = doc.createTextNode( Jsoup.parse(info.toString().replace("Directed By: ", "")).text()); director.appendChild(text); info = str2; count += 2; break; } info = info.concat(str2); } else { Element director = doc.createElement("director"); movie.appendChild(director); Text text = doc.createTextNode( Jsoup.parse(info.toString().replace("Directed By: ", "")).text()); director.appendChild(text); info = ""; count++; } break; } case 7: { if (!str2.contains(">In Theaters:<")) { if (str2.contains(">On DVD:<")) { Element writer = doc.createElement("writer"); movie.appendChild(writer); Text text = doc.createTextNode(Jsoup.parse(info.toString()).text()); writer.appendChild(text); info = str2; count += 2; break; } info = info.concat(str2); } else { Element writer = doc.createElement("writer"); movie.appendChild(writer); Text text = doc.createTextNode(Jsoup.parse(info.toString()).text()); writer.appendChild(text); info = str2; count++; } break; } case 8: { if (!str2.contains(">On DVD:<")) info = info.concat(str2); else { Element TheatreRelease = doc.createElement("TheatreRelease"); movie.appendChild(TheatreRelease); Text text = doc.createTextNode( Jsoup.parse(info.toString().replace("In Theaters:", "")).text()); TheatreRelease.appendChild(text); info = str2; count++; } break; } case 9: { if (!str2.contains(">US Box Office:<")) { if (str2.contains("itemprop=\"productionCompany\"")) { Element DvdRelease = doc.createElement("DvdRelease"); movie.appendChild(DvdRelease); Text text = doc.createTextNode( Jsoup.parse(info.toString().replace("On DVD:", "")).text()); DvdRelease.appendChild(text); info = str2; count += 2; break; } info = info.concat(str2); } else { Element DvdRelease = doc.createElement("DvdRelease"); movie.appendChild(DvdRelease); Text text = doc.createTextNode( Jsoup.parse(info.toString().replace("On DVD:", "")).text()); DvdRelease.appendChild(text); info = str2; count++; } break; } case 10: { if (!str2.contains("itemprop=\"productionCompany\"")) info = info.concat(str2); else { Element BOCollection = doc.createElement("BOCollection"); movie.appendChild(BOCollection); Text text = doc.createTextNode( Jsoup.parse(info.toString().replace("US Box Office:", "")).text()); BOCollection.appendChild(text); info = str2; count++; } break; } case 11: { if (!str2.contains(">Official Site")) info = info.concat(str2); else { Element Production = doc.createElement("Production"); movie.appendChild(Production); Text text = doc.createTextNode(Jsoup.parse(info.toString()).text()); Production.appendChild(text); info = str2; count++; } break; } default: break; } } } BufferedReader br4 = new BufferedReader(new InputStreamReader(reviewsURL.openStream())); String str3 = ""; String info2 = ""; int count2 = 0; while (null != (str3 = br4.readLine())) { if (count2 == 0) { if (!str3.contains("<div class=\"reviewsnippet\">")) continue; else count2++; } if (count2 == 1) { if (!str3.contains("<p class=\"small subtle\">")) info2 = info2.concat(str3); else { Element review = doc.createElement("review"); reviews.appendChild(review); Text text = doc.createTextNode(Jsoup.parse(info2.toString()).text()); review.appendChild(text); info2 = ""; count2 = 0; } } } movie.appendChild(reviews); TransformerFactory transfac = TransformerFactory.newInstance(); Transformer trans = transfac.newTransformer(); trans.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); trans.setOutputProperty(OutputKeys.INDENT, "yes"); // create string from xml tree StringWriter sw = new StringWriter(); StreamResult result = new StreamResult(sw); DOMSource source = new DOMSource(doc); trans.transform(source, result); String xmlString = sw.toString(); bw3.write(xmlString); br3.close(); br4.close(); bw3.close(); } catch (Exception ex) { ex.printStackTrace(); } }