/**
 * Decodes the three basic character entities in {@code str}: {@code &lt;} becomes
 * {@code <}, {@code &gt;} becomes {@code >} and {@code &amp;} becomes {@code &}.
 * Only the all-lowercase and all-uppercase spellings of each entity are recognised.
 *
 * <p>The entities are decoded exactly once. {@code &amp;} is handled last so that an
 * escaped entity such as {@code &amp;lt;} yields the literal text {@code &lt;} rather
 * than being decoded a second time into {@code <}.
 *
 * @param str the raw text to decode
 * @return an {@link AlignedSub} built from {@code str} with the replacements applied
 */
public static AlignedSub moreCleanup(String str) {
    AlignedSub ret = new AlignedSub(str);
    // Angle-bracket entities first: at this point every "&lt;" / "&gt;" in the text is
    // a genuine entity, not one produced by decoding "&amp;".
    ret = ret.replaceAll("&(lt|LT);", "<");
    ret = ret.replaceAll("&(gt|GT);", ">");
    // Ampersand last, so that the '&' it produces is never re-interpreted as the start
    // of another entity.
    ret = ret.replaceAll("&(amp|AMP);", "&");
    return ret;
}
/** some ACE docs have weird markup in them that serve as paragraph-ish markers * */ public static AlignedSub cleanupDocument(String document) { AlignedSub ret = new AlignedSub(document); ret = ret.replaceAll("<\\S+>", ""); ret = ret.replaceAll( leadingWhitespace, ""); // sentence breaker char offset correctness sensitive to this return ret; }