package gov.uspto.patent.doc.xml;

import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Base64;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Document.OutputSettings;
import org.jsoup.nodes.Document.OutputSettings.Syntax;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Entities.EscapeMode;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.parser.Parser;
import org.jsoup.parser.Tag;
import org.jsoup.safety.Whitelist;
import org.jsoup.select.Elements;

import com.google.common.base.Charsets;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableSet;

import gov.uspto.common.text.UnicodeUtil;
import gov.uspto.patent.ReferenceTagger;
import gov.uspto.patent.TextProcessor;
import gov.uspto.patent.doc.simplehtml.FreetextConfig;
import gov.uspto.patent.doc.simplehtml.HtmlToPlainText;

/**
 * Parse and clean formatted text fields, such as Description, Abstract and
 * Claims.
 *
 * <p>
 * The raw patent XML fragment is rewritten into a small, whitelisted HTML
 * vocabulary ("simple html"); plain text and paragraph lists are derived from
 * that simple html.
 *
 * @author Brian G. Feldman (brian.feldman@uspto.gov)
 *
 */
public class FormattedText implements TextProcessor {

    /** Single lower-case letter (optionally punctuated) directly following a reference, e.g. "a", "(a)" or " a;". */
    private static final Pattern TRAILING_REGEX = Pattern.compile("^\\s?(?:[.,;(])?([a-z])([.,;)]|\\b)");

    /** Text separating two entries of a figure list: ", ", " and " or ", and ". */
    private static final Pattern FIGREF_LIST_SEPARATOR = Pattern.compile("^(, |,? and )");

    /** A figure list entry: one or two digits with an optional ASCII letter suffix, e.g. "2" or "3c". */
    private static final Pattern FIGREF_LIST_ITEM = Pattern.compile("[0-9]{1,2}[A-Za-z]?");

    private static final Pattern FORMULA_LEAD_PI = Pattern
            .compile("<\\?in-line-formulae description=\"In-line Formulae\" end=\"lead\"\\?>");

    /*
     * The original pattern only accepted end="end". NOTE(review): end="tail" is
     * believed to be the closing marker used in USPTO grant XML; accepting both
     * keeps the opening tag from being left unclosed -- confirm against sample data.
     */
    private static final Pattern FORMULA_END_PI = Pattern
            .compile("<\\?in-line-formulae description=\"In-line Formulae\" end=\"(?:end|tail)\"\\?>");

    private static final Pattern DELETE_START_PI = Pattern.compile("<\\?delete-start [^?>]+\\?>");
    private static final Pattern DELETE_END_PI = Pattern.compile("<\\?delete-end [^?>]+\\?>");
    private static final Pattern INSERT_START_PI = Pattern.compile("<\\?insert-start [^?>]+\\?>");
    private static final Pattern INSERT_END_PI = Pattern.compile("<\\?insert-end [^?>]+\\?>");

    private static final String[] HTML_WHITELIST_TAGS = new String[] { "br", "b", "sub", "sup", "sup2", "sub2", "h1",
            "h2", "h3", "h4", "h5", "h6", "p", "table", "tbody", "thead", "th", "tr", "td", "ul", "ol", "li", "dl",
            "dt", "dd", "a", "span", "colgroup", "col", "del", "ins", "q", "smallcaps", "o", "u" };

    private static final String[] HTML_WHITELIST_ATTRIB = new String[] { "style", "class", "id", "idref", "num",
            "format", "type", "level", "width", "align", "valign", "rowspan" };

    public static final ImmutableSet<String> HEADER_ELEMENTS = ImmutableSet.of("heading", "p[id^=h-]");
    public static final ImmutableSet<String> TABLE_ELEMENTS = ImmutableSet.of("tr", "entry", "row", "table");
    public static final ImmutableSet<String> LIST_ELEMENTS = ImmutableSet.of("ul", "ol", "li", "dl", "dt", "dd");

    /**
     * Converts raw patent XML text to plain text by first producing simple html
     * and then handing it to {@link HtmlToPlainText}.
     *
     * @param rawText    raw XML fragment; must not be null
     * @param textConfig configuration passed to {@link HtmlToPlainText}
     * @return plain text rendering of the fragment
     */
    @Override
    public String getPlainText(String rawText, FreetextConfig textConfig) {
        String simpleHtml = getSimpleHtml(rawText);
        Document simpleDoc = Jsoup.parse(simpleHtml, "", Parser.xmlParser());
        return new HtmlToPlainText(textConfig).getPlainText(simpleDoc);
    }

    /**
     * Rewrites a raw patent XML fragment into whitelisted simple html.
     *
     * <p>
     * Steps, in order: processing instructions become elements; bold, overscore,
     * in-line formula, heading and reference elements are renamed/annotated;
     * figure lists and trailing figure letters are merged into their anchors;
     * MathML is hidden behind Base64 so it survives the whitelist cleaner;
     * sub/superscripts are converted to unicode where possible; lists and CALS
     * tables are mapped to html; the result is cleaned against the whitelist and
     * the MathML is restored.
     *
     * @param rawText raw XML fragment; must not be null
     * @return simple html string
     */
    @Override
    public String getSimpleHtml(String rawText) {
        String xml = convertProcessingInstructions(rawText);

        Document document = Jsoup.parse(xml, "", Parser.xmlParser());
        document.outputSettings().prettyPrint(false).syntax(Syntax.xml).charset(StandardCharsets.UTF_16);

        document.select("bold").tagName("b");
        convertOverscores(document);
        convertInlineFormulas(document);
        convertHeadings(document);
        convertReferences(document);

        // Must run after the reference elements have been renamed to anchors.
        fixFigrefLists(document);
        for (Element anchor : document.select("a")) {
            figrefMergeTrailing(anchor);
        }

        boolean mathFound = encodeMath(document);
        convertSubAndSuperscripts(document);
        convertLists(document);
        convertTables(document);

        String docStr = document.html().replaceAll("\\s{2,}", " ");
        // Turn a literal backslash followed by 'n' into a real line break.
        docStr = docStr.replaceAll("\\\\n", "\n");

        Whitelist whitelist = Whitelist.none();
        whitelist.addTags(HTML_WHITELIST_TAGS);
        whitelist.addAttributes(":all", HTML_WHITELIST_ATTRIB);

        OutputSettings outSettings = new Document.OutputSettings();
        outSettings.charset(Charsets.UTF_16);
        outSettings.syntax(Syntax.xml);
        outSettings.outline(true);
        outSettings.prettyPrint(false);

        docStr = Jsoup.clean(docStr, "", whitelist, outSettings);

        if (mathFound) {
            docStr = decodeMath(docStr);
        }
        return docStr;
    }

    /**
     * Replaces the in-line-formulae, delete and insert processing instructions
     * with ordinary elements so they are selectable within jsoup.
     */
    private static String convertProcessingInstructions(String rawText) {
        String text = rawText;

        // In-line formulae become a normal xml node, as in the Patent PAP format.
        text = FORMULA_LEAD_PI.matcher(text).replaceAll("<in-line-formula>");
        text = FORMULA_END_PI.matcher(text).replaceAll("</in-line-formula>");

        /*
         * NOTE(review): curly quotes are removed here, unchanged from the original
         * code. "q" is in the tag whitelist, so wrapping quoted text in a q element
         * may have been the intent -- confirm before changing.
         */
        text = text.replace("“", "");
        text = text.replace("”", "");

        // Processing instruction "delete" becomes the xml tag "del".
        text = DELETE_START_PI.matcher(text).replaceAll("<del>");
        text = DELETE_END_PI.matcher(text).replaceAll("</del>");

        // Processing instruction "insert" becomes the xml tag "ins".
        text = INSERT_START_PI.matcher(text).replaceAll("<ins>");
        text = INSERT_END_PI.matcher(text).replaceAll("</ins>");

        return text;
    }

    /** Copies the overscore style of each "o" element into its class attribute. */
    private static void convertOverscores(Document document) {
        for (Element overscore : document.select("o")) {
            if (overscore.hasAttr("ostyle")) {
                overscore.attr("class", overscore.attr("ostyle"));
            } else {
                // The original read an attribute named "single", which always yielded an
                // empty class. NOTE(review): "single" is assumed to be the intended default.
                overscore.attr("class", "single");
            }
        }
    }

    /** Turns each in-line-formula into a span.formula with a sequential FOR-nnnn id. */
    private static void convertInlineFormulas(Document document) {
        Elements formulas = document.select("in-line-formula");
        for (int i = 0; i < formulas.size(); i++) {
            Element formula = formulas.get(i);
            formula.attr("id", sequentialId("FOR-", i + 1));
            formula.tagName("span");
            formula.addClass("formula");
        }
    }

    /** Maps heading elements to h2, "h-" paragraphs to h4, and copies lvl to level. */
    private static void convertHeadings(Document document) {
        for (Element heading : document.select("heading")) {
            copyLevel(heading);
            heading.tagName("h2");
        }

        // Header paragraphs which have an id starting with "h-".
        document.select("p[id^=h-]").tagName("h4");

        for (Element paragraph : document.select("p")) {
            copyLevel(paragraph);
        }
    }

    /** Copies the "lvl" attribute, when present, to the whitelisted "level" attribute. */
    private static void copyLevel(Element element) {
        if (element.hasAttr("lvl")) {
            element.attr("level", element.attr("lvl"));
        }
    }

    /** Renames figure, claim, citation and cross references to classed anchors. */
    private static void convertReferences(Document document) {
        // Figure reference: anchor with class "figref", id FR-nnnn and a computed idref.
        Elements figRefs = document.select("figref");
        for (int i = 0; i < figRefs.size(); i++) {
            Element figRef = figRefs.get(i);
            figRef.attr("id", sequentialId("FR-", i + 1));
            figRef.attr("idref", ReferenceTagger.createFigId(figRef.text()));
            figRef.tagName("a");
            figRef.addClass("figref");
        }

        // Claim reference: anchor with class "claim" and id CR-nnnn.
        Elements claimRefs = document.select("claim-ref");
        for (int i = 0; i < claimRefs.size(); i++) {
            Element claimRef = claimRefs.get(i);
            claimRef.attr("id", sequentialId("CR-", i + 1));
            claimRef.tagName("a");
            claimRef.addClass("claim");
        }

        // Patent citation, non-patent-literature citation, and internal cross references.
        renameToAnchor(document, "patcit", "patcite");
        renameToAnchor(document, "nplcit", "nplcite");
        renameToAnchor(document, "crossref", "crossref");
    }

    /** Renames every element matching the selector to "a" and adds the given class. */
    private static void renameToAnchor(Document document, String selector, String cssClass) {
        for (Element element : document.select(selector)) {
            element.tagName("a");
            element.addClass(cssClass);
        }
    }

    /** Builds an id such as "FR-0001" from a prefix and a 1-based sequence number. */
    private static String sequentialId(String prefix, int number) {
        return prefix + Strings.padStart(String.valueOf(number), 4, '0');
    }

    /**
     * Replaces each math element with a span.math placeholder holding the
     * Base64 (UTF-8) form of its markup, so the whitelist cleaner cannot strip
     * the MathML elements.
     *
     * @return true when at least one math element was found
     */
    private boolean encodeMath(Document document) {
        Elements mathEls = document.select("math");
        for (int i = 0; i < mathEls.size(); i++) {
            Element math = mathEls.get(i);
            String encoded = Base64.getEncoder()
                    .encodeToString(math.outerHtml().getBytes(StandardCharsets.UTF_8));

            Element placeholder = new Element(Tag.valueOf("span"), "");
            placeholder.attr("id", sequentialId("MTH-", i + 1));
            placeholder.addClass("math");
            placeholder.attr("format", "mathml");
            placeholder.appendChild(new TextNode(encoded));

            math.replaceWith(placeholder);
            // NOTE(review): kept from the original; a merged trailing letter would be
            // appended to the Base64 payload -- confirm this call is wanted for math.
            figrefMergeTrailing(placeholder);
        }
        return !mathEls.isEmpty();
    }

    /**
     * Restores the MathML hidden by {@link #encodeMath(Document)}. Spans whose
     * content is not valid Base64 are left untouched.
     */
    private static String decodeMath(String cleanedHtml) {
        // Wrapped in body so the result can be read back with select("body").
        Document document = Jsoup.parse("<body>" + cleanedHtml + "</body>", "", Parser.xmlParser());
        document.outputSettings().prettyPrint(false).syntax(OutputSettings.Syntax.xml)
                .charset(StandardCharsets.UTF_16);

        for (Element span : document.select("span[class=math]")) {
            byte[] decoded;
            try {
                decoded = Base64.getDecoder().decode(span.html());
            } catch (IllegalArgumentException ignored) {
                // Not a placeholder written by encodeMath (or its payload was altered).
                continue;
            }
            span.text("");
            span.append(new String(decoded, StandardCharsets.UTF_8));
        }
        return document.select("body").html();
    }

    /** Replaces sub, then sup, elements by unicode characters when a conversion exists. */
    private static void convertSubAndSuperscripts(Document document) {
        for (Element sub : document.select("sub")) {
            try {
                sub.replaceWith(new TextNode(UnicodeUtil.toSubscript(sub.html())));
            } catch (ParseException ignored) {
                // No unicode equivalent; keep the sub element.
            }
        }

        for (Element sup : document.select("sup")) {
            try {
                sup.replaceWith(new TextNode(UnicodeUtil.toSuperscript(sup.html())));
            } catch (ParseException ignored) {
                // No unicode equivalent; keep the sup element.
            }
        }
    }

    /** Maps the list-style / ol-style attributes onto whitelisted html attributes. */
    private static void convertLists(Document document) {
        for (Element ul : document.select("ul")) {
            if (!ul.hasAttr("list-style")) {
                continue;
            }
            switch (ul.attr("list-style")) {
            case "none":
                ul.attr("style", "list-style-type:none");
                break;
            case "bullet":
                ul.attr("style", "list-style-type:disc");
                break;
            case "dash":
                ul.attr("class", "ul-dash");
                break;
            default:
                break;
            }
        }

        for (Element ol : document.select("ol")) {
            String type = ol.hasAttr("style") ? ol.attr("style") : ol.attr("ol-style");
            ol.attr("type", type);
        }
    }

    /** Converts CALS tables (colspec/row/entry) to html tables (col/tr/th/td). */
    private static void convertTables(Document document) {
        Elements tables = document.select("table");
        for (int i = 0; i < tables.size(); i++) {
            Element table = tables.get(i);
            table.attr("id", sequentialId("TBL-", i + 1));

            Element colGroup = table.prependElement("colgroup");
            for (Element spec : table.select("colspec")) {
                colGroup.appendElement("col").attr("width", spec.attr("colwidth")).attr("align",
                        spec.attr("align"));
            }

            for (Element row : table.select("thead row")) {
                for (Element cell : row.select("entry")) {
                    cell.tagName("th");
                }
                row.tagName("tr");
            }

            for (Element row : table.select("tbody row")) {
                for (Element cell : row.select("entry")) {
                    int rowspan = rowSpanOf(cell);
                    if (rowspan > 1) {
                        cell.attr("rowspan", String.valueOf(rowspan));
                    }
                    cell.tagName("td");
                }
                row.tagName("tr");
            }
        }
    }

    /** Returns morerows + 1, or 1 when the attribute is absent or not an integer. */
    private static int rowSpanOf(Element cell) {
        String moreRows = cell.attr("morerows").trim();
        if (moreRows.isEmpty()) {
            return 1;
        }
        try {
            return Integer.parseInt(moreRows) + 1;
        } catch (NumberFormatException ignored) {
            // Malformed value: treat as a single-row cell rather than abort the conversion.
            return 1;
        }
    }

    /**
     * Returns the text of a sibling node: the whole text of a text node, the
     * combined text of an element, or null for anything else (including null).
     */
    private static String siblingText(Node node) {
        if (node instanceof TextNode) {
            return ((TextNode) node).getWholeText();
        }
        if (node instanceof Element) {
            return ((Element) node).text();
        }
        return null;
    }

    /**
     * Pulls a trailing single letter that directly follows a reference back into
     * the reference element, e.g. a figure anchor "FIG. 4" followed by "a"
     * becomes "FIG. 4a".
     *
     * <p>
     * The merge happens only when the next sibling's text matches
     * {@link #TRAILING_REGEX} in full. NOTE(review): because a full match is
     * required, a letter followed by further text in the same node is not merged.
     *
     * @param element reference element whose next sibling is inspected
     */
    public void figrefMergeTrailing(Element element) {
        Node next = element.nextSibling();
        String trailingTxt = siblingText(next);
        if (trailingTxt == null) {
            return;
        }

        Matcher matcher = TRAILING_REGEX.matcher(trailingTxt);
        if (!matcher.matches()) {
            return;
        }

        String trailingChar = matcher.group(1);
        String buffer = matcher.group(2);
        // Whatever follows the letter (closing punctuation, if any) stays in the sibling.
        String tail = trailingTxt.substring(matcher.end() - buffer.length()).trim();

        if (next instanceof Element) {
            Element nextEl = (Element) next;
            if (trailingTxt.length() == 1) {
                // The sibling holds only the letter: move the whole element inside.
                element.append(nextEl.outerHtml());
                nextEl.remove();
            } else {
                nextEl.text(tail);
                element.text(element.text() + trailingChar);
            }
        } else {
            ((TextNode) next).text(tail);
            element.text(element.text() + trailingChar);
        }
    }

    /**
     * Fixes figure lists in which only the first entry is marked as a figure
     * reference, e.g. "FIGS. 1, 2 and 3c": following bold entries are turned into
     * figure reference anchors as well.
     *
     * @param document document whose "a.figref" anchors are inspected
     */
    public void fixFigrefLists(Document document) {
        for (Element element : document.select("a.figref")) {
            fixFigrefListItem(element);
        }
    }

    /**
     * When the given figure reference is followed by a list separator (", ",
     * " and ", ", and ") and then a bold element holding a figure number, the
     * bold element is replaced by a figure reference anchor; the check then
     * continues from the new anchor.
     *
     * @param element figure reference anchor to continue from
     */
    public void fixFigrefListItem(Element element) {
        Node separator = element.nextSibling();
        String trailingTxt = siblingText(separator);
        if (trailingTxt == null || !FIGREF_LIST_SEPARATOR.matcher(trailingTxt).matches()) {
            return;
        }

        Node candidate = separator.nextSibling();
        // Guard: nothing follows the separator, or it is not a bold element with leading text.
        if (candidate == null || !"b".equalsIgnoreCase(candidate.nodeName()) || candidate.childNodeSize() == 0
                || !(candidate.childNode(0) instanceof TextNode)) {
            return;
        }

        String containedTxt = ((TextNode) candidate.childNode(0)).getWholeText();
        if (!FIGREF_LIST_ITEM.matcher(containedTxt).matches()) {
            return;
        }

        Element newEl = element.clone();
        newEl.attr("id", "FR-" + Strings.padStart(containedTxt, 4, '0'));
        newEl.attr("idref", ReferenceTagger.createFigId(containedTxt));
        newEl.tagName("a");
        newEl.addClass("figref");
        newEl.text(containedTxt);
        candidate.replaceWith(newEl);

        fixFigrefListItem(newEl);
    }

    /**
     * Returns the inner html of every paragraph ("p") of the simple html form of
     * the given text, in document order.
     *
     * @param rawText raw XML fragment; must not be null
     * @return inner html of each paragraph; empty when there are none
     */
    @Override
    public List<String> getParagraphText(String rawText) {
        String textWithPMarks = getSimpleHtml(rawText);
        Document jsoupDoc = Jsoup.parse(textWithPMarks, "", Parser.xmlParser());

        List<String> paragraphs = new ArrayList<>();
        for (Element element : jsoupDoc.select("p")) {
            paragraphs.add(element.html());
        }
        return paragraphs;
    }
}