private void handleCaseOfImage(HTML dequeuedHtml) { // TODO Auto-generated method stub int contentLength = extractContentLength(dequeuedHtml); if (contentLength != -1) { synchronized (m_ResultWrapper.m_ImageAggregator) { m_ResultWrapper.m_ImageAggregator.m_NumOfImages++; m_ResultWrapper.m_ImageAggregator.m_TotalSizeInBytes += contentLength; } } else { System.out.println(dequeuedHtml.GetHeader() + " will not be aggregated!!!"); } // int ContentLength = -1; // String regex = "Content-Length:\\s+([\\d]*)"; // Pattern pat = Pattern.compile(regex); // Matcher m = pat.matcher(dequeuedHtml.GetHeader()); // if (m.find()) { // try { // ContentLength = Integer.parseInt(m.group(1)); // System.out.println("content length is: " + ContentLength); // // TODO: saveSomewhere // synchronized (m_ResultWrapper.m_ImageAggregator) { // m_ResultWrapper.m_ImageAggregator.m_NumOfImages++; // m_ResultWrapper.m_ImageAggregator.m_TotalSizeInBytes += // ContentLength; // } // } catch (NumberFormatException e) { // // TODO: handle exception // System.out.println("Could not parse Content-Lenght header is malformed\n" // + dequeuedHtml.GetHeader()); // } // } }
private void HandleHTML(HTML dequeuedHtml, HTML.TypeOfHTML typeOfHtml) { int contentLenght = extractContentLength(dequeuedHtml); switch (typeOfHtml) { case TEXT: if (contentLenght != -1) { synchronized (m_ResultWrapper) { m_ResultWrapper.m_PageAggregator.m_NumOfPages++; m_ResultWrapper.m_PageAggregator.m_TotalSizeOfPagesInBytes += contentLenght; } } System.out.println("(Data of page was aggregated) content is text"); getImgLinks(dequeuedHtml); getUrlLinks(dequeuedHtml); // TODO: can links have // HTTP:// ? break; case IMAGE: if (contentLenght != -1) { synchronized (m_ResultWrapper) { m_ResultWrapper.m_ImageAggregator.m_NumOfImages++; m_ResultWrapper.m_ImageAggregator.m_TotalSizeInBytes += contentLenght; } } else { // TODO: Delete this, deubgging purposes System.out.println(dequeuedHtml.GetHeader() + " will not be aggregated!!!"); } break; default: break; } }
private int extractContentLength(HTML dequeuedHtml) { int ContentLength = -1; String regex = "Content-Length:\\s+([\\d]*)"; Pattern pat = Pattern.compile(regex); Matcher m = pat.matcher(dequeuedHtml.GetHeader()); if (m.find()) { try { ContentLength = Integer.parseInt(m.group(1)); System.out.println("content length is: " + ContentLength); } catch (NumberFormatException e) { // TODO: handle exception System.out.println( "Could not parse Content-Lenght, header is malformed\n" + dequeuedHtml.GetHeader()); } } return ContentLength; }
@Override public void run() { while (ThreadManager.isRunning) { // m_DownloadQueue.getSize() !=0 || // m_AnalyzeQueue.getSize() != 0) { // TODO: think about // this condition // HTML dequeuedHtml = m_AnalyzeQueue.dequeue(); ThreadManager.updateStateAnalyzer(true); HTML dequeuedHtml = m_AnalyzeQueue.dequeue(); if (dequeuedHtml != null) { ThreadManager.updateStateAnalyzer(false); System.out.println("Analyzer: dequeued: \n Header:\n" + dequeuedHtml.GetHeader()); try { String contentTypeOfHtml = getContentTypeFromHeader( dequeuedHtml.GetHeader()); // dequeuedHtml.GetContentType().toLowerCase(); if (contentTypeOfHtml.contains(IMAGE)) { // in case of image // handle case of image System.out.println("Analzyer: it is an image"); // TODO: check extension supported // handleCaseOfImage(dequeuedHtml); HandleHTML(dequeuedHtml, HTML.TypeOfHTML.IMAGE); } else if (contentTypeOfHtml.contains(VIDEO)) { // in case System.out.println("Analzyer: it is an video"); // of // Video // handle case of video } else if (contentTypeOfHtml.contains(DOCUMENT)) { // in // TODO: what is the command for document? <a href? <doc // src=? // case System.out.println("Analzyer: it is an document"); // // // of // Document // handle case of document } else if (contentTypeOfHtml.contains("text")) { HandleHTML(dequeuedHtml, HTML.TypeOfHTML.TEXT); // int contentLenght = // extractContentLength(dequeuedHtml); // TODO: should // be done in different method using enum // if (contentLenght != -1) { // synchronized (m_ResultWrapper) { // m_ResultWrapper.m_PageAggregator.m_NumOfPages++; // m_ResultWrapper.m_PageAggregator.m_TotalSizeOfPagesInBytes+= // contentLenght; // } // } // System.out.println("(Data of page was aggregated) content is text"); // getImgLinks(dequeuedHtml); // getUrlLinks(dequeuedHtml); // TODO: can links have // HTTP:// ? } else { System.out.println("Downloader: Error in content type of " + dequeuedHtml.toString()); } } catch (Exception e) { System.out.println("Couldnt get content type from: " + dequeuedHtml.GetHeader()); System.out.println(e.getMessage()); e.printStackTrace(); } } } System.out.println("Analyzer Finished!"); // m_DownloadQueue.unregisterProducer(); }