/**
 * Saves a map with byte[] values to a file, converting each value to a UTF-8 string.
 *
 * @param file the target file
 * @param props the map to save
 * @param comment a comment written to the head of the file
 */
public static void saveMapB(final File file, final Map<String, byte[]> props, final String comment) {
    final HashMap<String, String> m = new HashMap<String, String>();
    for (final Map.Entry<String, byte[]> e : props.entrySet()) {
        m.put(e.getKey(), UTF8.String(e.getValue()));
    }
    saveMap(file, m, comment);
}
/**
 * Loads a map from a file, converting each String value back to a UTF-8 byte array.
 *
 * @param f the file to load from
 * @return the loaded map, or null if the file could not be read
 */
public static ConcurrentHashMap<String, byte[]> loadMapB(final File f) {
    final ConcurrentHashMap<String, String> m = loadMap(f);
    if (m == null) return null;
    final ConcurrentHashMap<String, byte[]> mb = new ConcurrentHashMap<String, byte[]>();
    for (final Map.Entry<String, String> e : m.entrySet()) {
        mb.put(e.getKey(), UTF8.getBytes(e.getValue()));
    }
    return mb;
}
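/*
 * A minimal usage sketch of the saveMapB/loadMapB round trip. The file name is
 * hypothetical, and the values must be valid UTF-8, because the round trip goes
 * through a String conversion in both directions.
 */
public static void exampleMapBRoundTrip() {
    final File f = new File("mapB-example.txt"); // hypothetical file
    final Map<String, byte[]> props = new HashMap<String, byte[]>();
    props.put("greeting", UTF8.getBytes("hello"));
    saveMapB(f, props, "example comment");
    final Map<String, byte[]> loaded = loadMapB(f);
    assert loaded != null && "hello".equals(UTF8.String(loaded.get("greeting")));
}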
/**
 * main - writes some data and checks the table's size (with time measuring)
 *
 * @param args
 */
public static void main(final String[] args) {
    // open a file, add one entry and exit
    final File f = new File(args[0]);
    if (f.exists()) FileUtils.deletedelete(f);
    try {
        final Records t = new Records(f, 8);
        final byte[] b = new byte[8];
        t.add("01234567".getBytes(), 0);
        t.add("ABCDEFGH".getBytes(), 0);
        t.add("abcdefgh".getBytes(), 0);
        t.add("--------".getBytes(), 0);
        t.add("********".getBytes(), 0);
        for (int i = 0; i < 1000; i++) t.add("++++++++".getBytes(), 0);
        t.add("=======0".getBytes(), 0);
        t.add("=======1".getBytes(), 0);
        t.add("=======2".getBytes(), 0);
        t.cleanLast(b, 0);
        System.out.println(UTF8.String(b));
        t.cleanLast(b, 0); // t.clean(2, b, 0);
        System.out.println(UTF8.String(b));
        t.get(1, b, 0);
        System.out.println(UTF8.String(b));
        t.put(1, "AbCdEfGh".getBytes(), 0);
        t.get(1, b, 0);
        System.out.println(UTF8.String(b));
        t.get(3, b, 0);
        System.out.println(UTF8.String(b));
        t.get(4, b, 0);
        System.out.println(UTF8.String(b));
        System.out.println("size = " + t.size());
        // t.clean(t.size() - 2);
        t.cleanLast();
        final long start = System.currentTimeMillis();
        long c = 0;
        for (int i = 0; i < 100000; i++) {
            c = t.size();
        }
        // 100000 calls: elapsed ms * 1e6 ns / 1e5 calls = elapsed * 10 nanoseconds per call
        System.out.println(
            "size() needs " + ((System.currentTimeMillis() - start) * 10) + " nanoseconds");
        System.out.println("size = " + c);
        t.close();
    } catch (final IOException e) {
        ConcurrentLog.logException(e);
    }
}
@Override
public Document[] parse(
        final DigestURL location,
        final String mimeType,
        final String charset,
        final VocabularyScraper scraper,
        final int timezoneOffset,
        final InputStream source)
        throws Parser.Failure, InterruptedException {
    byte[] b = null;
    try {
        b = FileUtils.read(source);
    } catch (final IOException e1) {
        throw new Parser.Failure(e1.toString(), location);
    }
    final BDecoder bd = new BDecoder(b);
    final BObject bo = bd.parse();
    if (bo == null) throw new Parser.Failure("BDecoder.parse returned null", location);
    if (bo.getType() != BType.dictionary)
        throw new Parser.Failure("BDecoder object is not a dictionary", location);
    final Map<String, BObject> map = bo.getMap();
    final BObject commento = map.get("comment");
    final String comment = (commento == null) ? "" : UTF8.String(commento.getString());
    // Date creation = new Date(map.get("creation date").getInteger());
    final BObject infoo = map.get("info");
    final StringBuilder filenames = new StringBuilder(80);
    String title = "";
    if (infoo != null) {
        final Map<String, BObject> info = infoo.getMap();
        final BObject fileso = info.get("files");
        if (fileso != null) {
            final List<BObject> filelist = fileso.getList();
            for (final BObject fo : filelist) {
                final BObject patho = fo.getMap().get("path");
                if (patho != null) {
                    final List<BObject> l = patho.getList(); // one file may have several names
                    for (final BObject fl : l) {
                        filenames.append(fl.toString()).append(" ");
                    }
                }
            }
        }
        final BObject nameo = info.get("name");
        if (nameo != null) title = UTF8.String(nameo.getString());
    }
    if (title == null || title.isEmpty()) title = MultiProtocolURL.unescape(location.getFileName());
    return new Document[] {
        new Document(
                location,
                mimeType,
                charset,
                this,
                null,
                null,
                singleList(title), // title
                comment, // author
                location.getHost(),
                null,
                null,
                0.0d,
                0.0d,
                filenames.toString(),
                null,
                null,
                null,
                false,
                new Date())
    };
}
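/*
 * For reference, the bencoded dictionary a .torrent file carries (per BEP 3),
 * which the parser above walks to collect the title and the file names:
 *
 *   {
 *     "announce": <tracker URL>,
 *     "comment": <optional free text>,            // used in the author slot above
 *     "creation date": <unix timestamp>,          // currently not evaluated
 *     "info": {
 *       "name": <suggested file/directory name>,  // used as the title
 *       "files": [                                // multi-file torrents only
 *         { "length": <bytes>, "path": [<dir>, ..., <file name>] },
 *         ...
 *       ]
 *     }
 *   }
 */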
@Override
public Document[] parse(
        final AnchorURL location,
        final String mimeType,
        final String charset,
        final VocabularyScraper scraper,
        final int timezoneOffset,
        final InputStream source)
        throws Parser.Failure, InterruptedException {

    // check memory for parser
    if (!MemoryControl.request(200 * 1024 * 1024, false))
        throw new Parser.Failure(
                "Not enough Memory available for pdf parser: " + MemoryControl.available(), location);

    // create a pdf parser
    PDDocument pdfDoc;
    try {
        Thread.currentThread().setPriority(Thread.MIN_PRIORITY); // the pdfparser is a big pain
        // pdfDoc = PDDocument.load(source);
        final PDFParser pdfParser = new PDFParser(source);
        pdfParser.setTempDirectory(new File(System.getProperty("java.io.tmpdir")));
        pdfParser.parse();
        pdfDoc = pdfParser.getPDDocument();
    } catch (final IOException e) {
        throw new Parser.Failure(e.getMessage(), location);
    } finally {
        Thread.currentThread().setPriority(Thread.NORM_PRIORITY);
    }

    if (pdfDoc.isEncrypted()) {
        try {
            pdfDoc.openProtection(new StandardDecryptionMaterial(""));
        } catch (final BadSecurityHandlerException e) {
            try { pdfDoc.close(); } catch (final IOException ee) { }
            throw new Parser.Failure("Document is encrypted (1): " + e.getMessage(), location);
        } catch (final IOException e) {
            try { pdfDoc.close(); } catch (final IOException ee) { }
            throw new Parser.Failure("Document is encrypted (2): " + e.getMessage(), location);
        } catch (final CryptographyException e) {
            try { pdfDoc.close(); } catch (final IOException ee) { }
            throw new Parser.Failure("Document is encrypted (3): " + e.getMessage(), location);
        }
        final AccessPermission perm = pdfDoc.getCurrentAccessPermission();
        if (perm == null || !perm.canExtractContent()) {
            try { pdfDoc.close(); } catch (final IOException ee) { }
            throw new Parser.Failure("Document is encrypted and cannot be decrypted", location);
        }
    }

    // extracting some metadata
    PDDocumentInformation info = pdfDoc.getDocumentInformation();
    String docTitle = null, docSubject = null, docAuthor = null, docPublisher = null, docKeywordStr = null;
    Date docDate = new Date();
    if (info != null) {
        docTitle = info.getTitle();
        docSubject = info.getSubject();
        docAuthor = info.getAuthor();
        docPublisher = info.getProducer();
        if (docPublisher == null || docPublisher.isEmpty()) docPublisher = info.getCreator();
        docKeywordStr = info.getKeywords();
        try {
            if (info.getModificationDate() != null) docDate = info.getModificationDate().getTime();
        } catch (final IOException e) { }
        // unused: info.getTrapped()
    }
    info = null;

    if (docTitle == null || docTitle.isEmpty()) {
        docTitle = MultiProtocolURL.unescape(location.getFileName());
    }
    if (docTitle == null) {
        docTitle = docSubject;
    }
    String[] docKeywords = null;
    if (docKeywordStr != null) {
        docKeywords = docKeywordStr.split(" |,");
    }

    Collection<AnchorURL>[] pdflinks = null;
    Document[] result = null;
    try {
        // get the links
        pdflinks = extractPdfLinks(pdfDoc);

        // get the fulltext (either per document or for each page)
        final PDFTextStripper stripper = new PDFTextStripper(StandardCharsets.UTF_8.name());

        if (individualPages) {
            // this is a hack which stores individual pages of the source pdf into individual
            // index documents. the new documents will get a virtual link with a post argument
            // page=X appended to the original url

            // collect text
            final int pagecount = pdfDoc.getNumberOfPages();
            final String[] pages = new String[pagecount];
            for (int page = 1; page <= pagecount; page++) {
                stripper.setStartPage(page);
                stripper.setEndPage(page);
                pages[page - 1] = stripper.getText(pdfDoc);
                // System.out.println("PAGE " + page + ": " + pages[page - 1]);
            }

            // create individual documents for each page
            assert pages.length == pdflinks.length
                    : "pages.length = " + pages.length + ", pdflinks.length = " + pdflinks.length;
            result = new Document[Math.min(pages.length, pdflinks.length)];
            final String loc = location.toNormalform(true);
            for (int page = 0; page < result.length; page++) {
                result[page] =
                        new Document(
                                new AnchorURL(
                                        loc
                                                + (loc.indexOf('?') > 0 ? '&' : '?')
                                                + individualPagePropertyname
                                                + '='
                                                + (page + 1)), // these are virtual new pages; we cannot combine them
                                                               // with '#' as that would be removed when computing the urlhash
                                mimeType,
                                StandardCharsets.UTF_8.name(),
                                this,
                                null,
                                docKeywords,
                                singleList(docTitle),
                                docAuthor,
                                docPublisher,
                                null,
                                null,
                                0.0f,
                                0.0f,
                                pages == null || page >= pages.length ? new byte[0] : UTF8.getBytes(pages[page]),
                                pdflinks == null || page >= pdflinks.length ? null : pdflinks[page],
                                null,
                                null,
                                false,
                                docDate);
            }
        } else {
            // collect the whole text at once
            final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
            byte[] contentBytes = new byte[0];
            stripper.setEndPage(3); // get first 3 pages (always)
            writer.append(stripper.getText(pdfDoc));
            contentBytes = writer.getBytes(); // remember text in case of interrupting thread

            if (pdfDoc.getNumberOfPages() > 3) { // spare creating/starting a thread if all pages are read
                stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text)
                stripper.setEndPage(Integer.MAX_VALUE); // set to default

                // we start the pdf parsing in a separate thread to ensure that it can be terminated
                final PDDocument pdfDocC = pdfDoc;
                final Thread t = new Thread() {
                    @Override
                    public void run() {
                        Thread.currentThread().setName("pdfParser.getText:" + location);
                        try {
                            writer.append(stripper.getText(pdfDocC));
                        } catch (final Throwable e) { }
                    }
                };
                t.start();
                t.join(3000);
                if (t.isAlive()) t.interrupt(); // pdfbox likes to forget to terminate ... (quite often)
            }
            contentBytes = writer.getBytes(); // get final text before closing writer

            final Collection<AnchorURL> pdflinksCombined = new HashSet<AnchorURL>();
            for (final Collection<AnchorURL> pdflinksx : pdflinks)
                if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx);
            result = new Document[] {
                new Document(
                        location,
                        mimeType,
                        StandardCharsets.UTF_8.name(),
                        this,
                        null,
                        docKeywords,
                        singleList(docTitle),
                        docAuthor,
                        docPublisher,
                        null,
                        null,
                        0.0f,
                        0.0f,
                        contentBytes,
                        pdflinksCombined,
                        null,
                        null,
                        false,
                        docDate)
            };
        }
    } catch (final Throwable e) {
        // any extraction error is swallowed; pdfDoc is closed in the finally block
        // throw new Parser.Failure(e.getMessage(), location);
    } finally {
        try {
            pdfDoc.close();
        } catch (final Throwable e) { }
    }

    // clear resources in pdfbox. they say that is resolved but it's not. see:
    // https://issues.apache.org/jira/browse/PDFBOX-313
    // https://issues.apache.org/jira/browse/PDFBOX-351
    // https://issues.apache.org/jira/browse/PDFBOX-441
    // pdfbox still generates an enormous number of object allocations and does not delete them.
    // the following objects are statically stored and never flushed:
    // COSFloat, COSArray, COSInteger, COSObjectKey, COSObject, COSDictionary,
    // COSStream, COSString, COSName, COSDocument, COSInteger[], COSNull
    // the great number of these objects can easily be seen in Java VisualVM.
    // we try to get this rubbish out of memory here with forced clear calls and hope for the best.
    pdfDoc = null;
    clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes();

    return result;
}
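/*
 * A minimal, self-contained sketch of the timeout pattern used above for the
 * full-text extraction: run the blocking work on a worker thread, wait a bounded
 * time, and interrupt the worker if it has not finished. The method name and
 * timeout here are hypothetical stand-ins, not part of the parser.
 */
private static String extractWithTimeout(
        final java.util.concurrent.Callable<String> task, final long timeoutMillis) {
    final java.util.concurrent.FutureTask<String> future =
            new java.util.concurrent.FutureTask<String>(task);
    final Thread worker = new Thread(future, "extractWithTimeout");
    worker.start();
    try {
        return future.get(timeoutMillis, java.util.concurrent.TimeUnit.MILLISECONDS);
    } catch (final java.util.concurrent.TimeoutException e) {
        worker.interrupt(); // the task hangs; give up and fall back to what we already have
        return "";
    } catch (final Exception e) {
        return ""; // ExecutionException or InterruptedException: no text extracted
    }
}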
/**
 * Deletes files and directories. If a directory is not empty, everything inside it is deleted
 * first. Because deletion sometimes fails on Windows, a Windows shell command is used as a
 * fallback.
 *
 * @param path the file or directory to delete
 */
public static void deletedelete(final File path) {
    if (path == null || !path.exists()) {
        return;
    }

    // empty the directory first
    if (path.isDirectory()) {
        final String[] list = path.list();
        if (list != null) {
            for (final String s : list) {
                deletedelete(new File(path, s));
            }
        }
    }

    if (path.exists()) path.delete();
    /*
    int c = 0;
    while ( c++ < 20 ) {
        if ( !path.exists() ) {
            break;
        }
        if ( path.delete() ) {
            break;
        }
        // some OS may be slow when giving up file pointer
        //System.runFinalization();
        //System.gc();
        try {
            Thread.sleep(200);
        } catch (final InterruptedException e ) {
            break;
        }
    }
    */
    if (path.exists()) {
        path.deleteOnExit();
        String p = "";
        try {
            p = path.getCanonicalPath();
        } catch (final IOException e1) {
            ConcurrentLog.logException(e1);
        }
        if (System.getProperties().getProperty("os.name", "").toLowerCase().startsWith("windows")) {
            // deleting files on windows sometimes does not work with java
            try {
                final String command = "cmd /C del /F /Q \"" + p + "\"";
                final Process r = Runtime.getRuntime().exec(command);
                if (r == null) {
                    ConcurrentLog.severe("FileUtils", "cannot execute command: " + command);
                } else {
                    final byte[] response = read(r.getInputStream());
                    ConcurrentLog.info("FileUtils", "deletedelete: " + UTF8.String(response));
                }
            } catch (final IOException e) {
                ConcurrentLog.logException(e);
            }
        }
        if (path.exists()) {
            ConcurrentLog.severe("FileUtils", "cannot delete file " + p);
        }
    }
}
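/*
 * On Java 7+ the same recursive delete can be expressed with java.nio.file, which
 * reports the cause of a failed deletion as an exception instead of returning false.
 * A minimal sketch (not used by deletedelete above):
 */
public static void deleteRecursively(final java.nio.file.Path root) throws IOException {
    java.nio.file.Files.walkFileTree(root, new java.nio.file.SimpleFileVisitor<java.nio.file.Path>() {
        @Override
        public java.nio.file.FileVisitResult visitFile(
                final java.nio.file.Path file, final java.nio.file.attribute.BasicFileAttributes attrs)
                throws IOException {
            java.nio.file.Files.delete(file); // throws IOException with the reason on failure
            return java.nio.file.FileVisitResult.CONTINUE;
        }

        @Override
        public java.nio.file.FileVisitResult postVisitDirectory(
                final java.nio.file.Path dir, final IOException exc) throws IOException {
            if (exc != null) throw exc;
            java.nio.file.Files.delete(dir); // the directory is empty by now
            return java.nio.file.FileVisitResult.CONTINUE;
        }
    });
}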
static final void sendRespondHeader(
        final HashMap<String, Object> conProp,
        final OutputStream respond,
        String httpVersion,
        final int httpStatusCode,
        String httpStatusText,
        ResponseHeader responseHeader)
        throws IOException {

    if (respond == null) throw new NullPointerException("The outputstream must not be null.");
    if (conProp == null)
        throw new NullPointerException("The connection property structure must not be null.");
    if (httpVersion == null)
        httpVersion = (String) conProp.get(HeaderFramework.CONNECTION_PROP_HTTP_VER);
    if (httpVersion == null) httpVersion = HeaderFramework.HTTP_VERSION_1_1;
    if (responseHeader == null) responseHeader = new ResponseHeader(httpStatusCode);

    try {
        if ((httpStatusText == null) || (httpStatusText.length() == 0)) {
            // http1_1 includes http1_0 messages
            if (HeaderFramework.http1_1.containsKey(Integer.toString(httpStatusCode)))
                httpStatusText = HeaderFramework.http1_1.get(Integer.toString(httpStatusCode));
            else httpStatusText = "Unknown";
        }

        final StringBuilder header = new StringBuilder(560);

        // "HTTP/0.9" does not have a status line or header in the response
        if (!httpVersion.toUpperCase().equals(HeaderFramework.HTTP_VERSION_0_9)) {
            // write status line
            header
                    .append(httpVersion)
                    .append(" ")
                    .append(Integer.toString(httpStatusCode))
                    .append(" ")
                    .append(httpStatusText)
                    .append("\r\n");

            // prepare header
            if (!responseHeader.containsKey(HeaderFramework.DATE))
                responseHeader.put(HeaderFramework.DATE, HeaderFramework.formatRFC1123(new Date()));
            if (!responseHeader.containsKey(HeaderFramework.CONTENT_TYPE))
                responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html; charset=UTF-8"); // fix this
            if (!responseHeader.containsKey(RequestHeader.CONNECTION)
                    && conProp.containsKey(HeaderFramework.CONNECTION_PROP_PERSISTENT))
                responseHeader.put(
                        RequestHeader.CONNECTION,
                        (String) conProp.get(HeaderFramework.CONNECTION_PROP_PERSISTENT));
            if (!responseHeader.containsKey(RequestHeader.PROXY_CONNECTION)
                    && conProp.containsKey(HeaderFramework.CONNECTION_PROP_PERSISTENT))
                responseHeader.put(
                        RequestHeader.PROXY_CONNECTION,
                        (String) conProp.get(HeaderFramework.CONNECTION_PROP_PERSISTENT));
            if (conProp.containsKey(HeaderFramework.CONNECTION_PROP_PERSISTENT)
                    && conProp.get(HeaderFramework.CONNECTION_PROP_PERSISTENT).equals("keep-alive")
                    && !responseHeader.containsKey(HeaderFramework.TRANSFER_ENCODING)
                    && !responseHeader.containsKey(HeaderFramework.CONTENT_LENGTH))
                responseHeader.put(HeaderFramework.CONTENT_LENGTH, "0");

            // read custom headers
            final Iterator<ResponseHeader.Entry> it =
                    responseHeader.getAdditionalHeaderProperties().iterator();
            ResponseHeader.Entry e;
            while (it.hasNext()) {
                // append user properties to the main String
                // TODO: should we check the user properties? what if they intersect properties
                // that are already in the header?
                e = it.next();
                header.append(e.getKey()).append(": ").append(e.getValue()).append("\r\n");
            }

            // write header
            final Iterator<String> i = responseHeader.keySet().iterator();
            String key;
            char tag;
            int count;
            while (i.hasNext()) {
                key = i.next();
                tag = key.charAt(0);
                if ((tag != '*') && (tag != '#')) {
                    // '#' in key is reserved for proxy attributes as artificial header values
                    count = responseHeader.keyCount(key);
                    for (int j = 0; j < count; j++) {
                        header
                                .append(key)
                                .append(": ")
                                .append(responseHeader.getSingle(key, j))
                                .append("\r\n");
                    }
                }
            }

            // end header
            header.append("\r\n");

            // send the headers to the client and flush the stream
            respond.write(UTF8.getBytes(header.toString()));
            respond.flush();
        }

        conProp.put(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_HEADER, responseHeader);
        conProp.put(
                HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_STATUS, Integer.toString(httpStatusCode));
    } catch (final Exception e) {
        // any interruption may be caused by a network error or because the user has closed
        // the window during transmission. We simply pass it on as an IOException.
        throw new IOException(e.getMessage());
    }
}
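/*
 * For illustration, the bytes sendRespondHeader writes for a plain 200 response
 * look roughly like this (the exact header names, values, and order depend on
 * the ResponseHeader and connection properties at runtime):
 *
 *   HTTP/1.1 200 OK\r\n
 *   Date: Tue, 23 Aug 2005 11:19:14 GMT\r\n
 *   Content-Type: text/html; charset=UTF-8\r\n
 *   Content-Length: 0\r\n
 *   \r\n
 */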
private static final void sendRespondError(
        final HashMap<String, Object> conProp,
        final OutputStream respond,
        final int errorcase,
        final int httpStatusCode,
        String httpStatusText,
        final String detailedErrorMsgText,
        final Object detailedErrorMsgFile,
        final serverObjects detailedErrorMsgValues,
        final Throwable stackTrace,
        ResponseHeader header)
        throws IOException {

    FileInputStream fis = null;
    ByteArrayOutputStream o = null;
    try {
        // setting the proper http status message
        String httpVersion = (String) conProp.get(HeaderFramework.CONNECTION_PROP_HTTP_VER);
        if (httpVersion == null) httpVersion = "HTTP/1.1";
        if ((httpStatusText == null) || (httpStatusText.length() == 0)) {
            // http1_1 includes http1_0 messages
            if (HeaderFramework.http1_1.containsKey(Integer.toString(httpStatusCode)))
                httpStatusText = HeaderFramework.http1_1.get(Integer.toString(httpStatusCode));
            else httpStatusText = "Unknown";
        }

        // generating the desired request url
        String host = (String) conProp.get(HeaderFramework.CONNECTION_PROP_HOST);
        String path = (String) conProp.get(HeaderFramework.CONNECTION_PROP_PATH);
        if (path == null) path = "/";
        final String args = (String) conProp.get(HeaderFramework.CONNECTION_PROP_ARGS);
        final String method = (String) conProp.get(HeaderFramework.CONNECTION_PROP_METHOD);

        final int port = Domains.stripToPort(host);
        host = Domains.stripToHostName(host);

        String urlString;
        try {
            urlString =
                    (new DigestURL(
                                    (method.equals(HeaderFramework.METHOD_CONNECT) ? "https" : "http"),
                                    host,
                                    port,
                                    (args == null) ? path : path + "?" + args))
                            .toString();
        } catch (final MalformedURLException e) {
            urlString = "invalid URL";
        }

        // set rewrite values
        final serverObjects tp = new serverObjects();
        String clientIP = (String) conProp.get(HeaderFramework.CONNECTION_PROP_CLIENTIP);
        if (clientIP == null) clientIP = Domains.LOCALHOST;
        tp.put("peerName", (switchboard.peers == null) ? "" : switchboard.peers.myName());
        tp.put("errorMessageType", Integer.toString(errorcase));
        tp.put("httpStatus", Integer.toString(httpStatusCode) + " " + httpStatusText);
        tp.put("requestMethod", (String) conProp.get(HeaderFramework.CONNECTION_PROP_METHOD));
        tp.put("requestURL", urlString);

        switch (errorcase) {
            case ERRORCASE_FILE:
                tp.put(
                        "errorMessageType_file",
                        (detailedErrorMsgFile == null) ? "" : detailedErrorMsgFile.toString());
                if ((detailedErrorMsgValues != null) && !detailedErrorMsgValues.isEmpty()) {
                    // rewrite the value names and add the proper name prefix:
                    for (final Entry<String, String> entry : detailedErrorMsgValues.entrySet()) {
                        tp.put("errorMessageType_" + entry.getKey(), entry.getValue());
                    }
                }
                break;
            case ERRORCASE_MESSAGE:
            default:
                tp.put(
                        "errorMessageType_detailedErrorMsg",
                        (detailedErrorMsgText == null)
                                ? ""
                                : detailedErrorMsgText.replaceAll("\n", "<br />"));
                break;
        }

        // building the stacktrace
        if (stackTrace != null) {
            tp.put("printStackTrace", "1");
            final ByteBuffer errorMsg = new ByteBuffer(100);
            final PrintStream printStream = new PrintStream(errorMsg);
            stackTrace.printStackTrace(printStream);
            tp.put("printStackTrace_exception", stackTrace.toString());
            tp.put("printStackTrace_stacktrace", UTF8.String(errorMsg.getBytes()));
            printStream.close();
        } else {
            tp.put("printStackTrace", "0");
        }

        // Generated Tue, 23 Aug 2005 11:19:14 GMT by brain.wg (squid/2.5.STABLE3)

        // adding some system information
        final String systemDate = HeaderFramework.formatRFC1123(new Date());
        tp.put("date", systemDate);

        // rewrite the file
        final File htRootPath =
                new File(
                        switchboard.getAppPath(),
                        switchboard.getConfig(
                                SwitchboardConstants.HTROOT_PATH, SwitchboardConstants.HTROOT_PATH_DEFAULT));

        TemplateEngine.writeTemplate(
                "/proxymsg/error.html",
                fis = new FileInputStream(new File(htRootPath, "/proxymsg/error.html")),
                o = new ByteArrayOutputStream(512),
                tp);
        final byte[] result = o.toByteArray();
        o.close();
        o = null;

        if (header == null) header = new ResponseHeader(httpStatusCode);
        header.put(
                HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_STATUS, Integer.toString(httpStatusCode));
        header.put(HeaderFramework.DATE, systemDate);
        header.put(HeaderFramework.CONTENT_TYPE, "text/html");
        header.put(HeaderFramework.CONTENT_LENGTH, Integer.toString(result.length));
        header.put(HeaderFramework.PRAGMA, "no-cache, no-store");
        sendRespondHeader(conProp, respond, httpVersion, httpStatusCode, httpStatusText, header);

        if (!method.equals(HeaderFramework.METHOD_HEAD)) {
            // write the array to the client
            FileUtils.copy(result, respond);
        }
        respond.flush();
    } finally {
        if (fis != null)
            try {
                fis.close();
            } catch (final Exception e) {
                ConcurrentLog.logException(e);
            }
        if (o != null)
            try {
                o.close();
            } catch (final Exception e) {
                ConcurrentLog.logException(e);
            }
    }
}
@Override
public Request pop(boolean delay, CrawlSwitchboard cs, RobotsTxt robots) throws IOException {
    // returns a crawl entry from the stack and ensures minimum delta times

    long sleeptime = 0;
    Request crawlEntry = null;
    CrawlProfile profileEntry = null;
    synchronized (this) {
        mainloop:
        while (true) {
            final Index depthStack = getLowestStack();
            if (depthStack == null) return null;
            Row.Entry rowEntry = null;
            while (depthStack.size() > 0) {
                rowEntry = depthStack.removeOne();
                if (rowEntry != null) break;
            }
            if (rowEntry == null) continue mainloop;
            crawlEntry = new Request(rowEntry);

            // check blacklist (again) because the user may have created blacklist entries
            // after the queue has been filled
            if (Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, crawlEntry.url())) {
                if (log.isFine()) log.fine("URL '" + crawlEntry.url() + "' is in blacklist.");
                continue mainloop;
            }

            // at this point we must check if the crawlEntry is still relevant, i.e. if the
            // crawl profile still exists. if not: return null. a calling method must handle
            // the null value and try again
            profileEntry = cs.get(UTF8.getBytes(crawlEntry.profileHandle()));
            if (profileEntry == null) {
                if (log.isFine()) log.fine("no profile entry for handle " + crawlEntry.profileHandle());
                continue mainloop;
            }

            // depending on the caching policy we need sleep time to avoid DoS-like situations
            sleeptime = Latency.getDomainSleepTime(robots, profileEntry, crawlEntry.url());
            break;
        }
    }
    if (crawlEntry == null) return null;

    ClientIdentification.Agent agent =
            profileEntry == null
                    ? ClientIdentification.yacyInternetCrawlerAgent
                    : profileEntry.getAgent();
    long robotsTime = Latency.getRobotsTime(robots, crawlEntry.url(), agent);
    Latency.updateAfterSelection(crawlEntry.url(), profileEntry == null ? 0 : robotsTime);
    if (delay && sleeptime > 0) {
        // force a busy waiting here.
        // in the best case, this should never happen if the balancer works properly.
        // this is only a protection against the worst case, where the crawler could
        // behave in a DoS-like manner
        if (log.isInfo())
            log.info(
                    "forcing crawl-delay of "
                            + sleeptime
                            + " milliseconds for "
                            + crawlEntry.url().getHost()
                            + ": "
                            + Latency.waitingRemainingExplain(crawlEntry.url(), robots, agent));
        long loops = sleeptime / 1000;
        long rest = sleeptime % 1000;
        if (loops < 3) {
            rest = rest + 1000 * loops;
            loops = 0;
        }
        Thread.currentThread()
                .setName(
                        "Balancer waiting for "
                                + crawlEntry.url().getHost()
                                + ": "
                                + sleeptime
                                + " milliseconds");
        synchronized (this) {
            // must be synchronized here to avoid 'takeover' moves from other threads which
            // would then idle the same time, which would not be enough
            if (rest > 0) {
                try {
                    this.wait(rest);
                } catch (final InterruptedException e) { }
            }
            for (int i = 0; i < loops; i++) {
                if (log.isInfo())
                    log.info(
                            "waiting for "
                                    + crawlEntry.url().getHost()
                                    + ": "
                                    + (loops - i)
                                    + " seconds remaining...");
                try {
                    this.wait(1000);
                } catch (final InterruptedException e) { }
            }
        }
        Latency.updateAfterSelection(crawlEntry.url(), robotsTime);
    }
    return crawlEntry;
}
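/*
 * The crawl-delay wait above splits the sleep time into one-second wait() slices so
 * that progress can be logged and the delay stays responsive to notify(). Worked
 * examples of the split (values illustrative):
 *
 *   sleeptime = 4700 ms  ->  loops = 4, rest = 700   ->  wait(700), then 4 x wait(1000)
 *   sleeptime = 2300 ms  ->  loops = 2, rest = 300   ->  loops < 3, so rest becomes 2300
 *                                                        and loops 0: a single wait(2300)
 */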