Repo created

This commit is contained in:
Fr4nz D13trich 2025-11-22 13:56:56 +01:00
parent 75dc487a7a
commit 39c29d175b
6317 changed files with 388324 additions and 2 deletions

View file

@ -0,0 +1,8 @@
// Gradle build script (Kotlin DSL) for a library module.
plugins {
// Project-defined plugin id; presumably the JVM library convention plugin -- confirm in the build logic.
id(ThunderbirdPlugins.Library.jvm)
// Plugin alias resolved from the `libs` version catalog.
alias(libs.plugins.android.lint)
}
dependencies {
// jsoup, resolved from the `libs` version catalog; the only dependency declared by this script.
implementation(libs.jsoup)
}

View file

@ -0,0 +1,93 @@
package app.k9mail.html.cleaner
import org.jsoup.nodes.Document
import org.jsoup.safety.Cleaner
import org.jsoup.safety.Safelist
/**
 * Produces a cleaned copy of an HTML document by running it through a jsoup [Cleaner].
 *
 * The cleaner is configured with jsoup's relaxed [Safelist], extended with additional tags and attributes
 * (see [createAllowList]). After cleaning, the document type and an allow-listed subset of the `<body>`
 * attributes are carried over from the original document.
 */
internal class BodyCleaner {
    /** Attribute names that are copied from the original `<body>` element onto the cleaned one. */
    private val allowedBodyAttributes = setOf(
        "id",
        "class",
        "dir",
        "lang",
        "style",
        "alink",
        "background",
        "bgcolor",
        "link",
        "text",
        "vlink",
    )

    private val cleaner: Cleaner = Cleaner(createAllowList())

    /**
     * Returns a cleaned copy of [dirtyDocument].
     *
     * Note that the document type node, if any, is inserted into the returned document, i.e. it is taken
     * from [dirtyDocument].
     */
    fun clean(dirtyDocument: Document): Document =
        cleaner.clean(dirtyDocument).also { sanitized ->
            copyDocumentType(dirtyDocument, sanitized)
            copyBodyAttributes(dirtyDocument, sanitized)
        }

    /** Builds the [Safelist] describing which tags, attributes and URI schemes are kept. */
    private fun createAllowList(): Safelist = Safelist.relaxed().apply {
        addTags("font", "hr", "ins", "del", "center", "map", "area", "title", "tt", "kbd", "samp", "var", "style", "s")
        addAttributes("font", "color", "face", "size")
        addAttributes("a", "name")
        addAttributes("div", "align")
        addAttributes("table", "align", "background", "bgcolor", "border", "cellpadding", "cellspacing", "width")
        addAttributes("tr", "align", "background", "bgcolor", "valign")
        addAttributes(
            "th",
            "align",
            "background",
            "bgcolor",
            "colspan",
            "headers",
            "height",
            "nowrap",
            "rowspan",
            "scope",
            "sorted",
            "valign",
            "width",
        )
        addAttributes(
            "td",
            "align",
            "background",
            "bgcolor",
            "colspan",
            "headers",
            "height",
            "nowrap",
            "rowspan",
            "scope",
            "valign",
            "width",
        )
        addAttributes("map", "name")
        addAttributes("area", "shape", "coords", "href", "alt")
        addProtocols("area", "href", "http", "https")
        addAttributes("img", "usemap")
        addAttributes(":all", "class", "style", "id", "dir")
        addProtocols("img", "src", "http", "https", "cid", "data")
        // Allow all URI schemes in links
        removeProtocols("a", "href", "ftp", "http", "https", "mailto")
    }

    /** Inserts the document type of [dirtyDocument], when present, as first child of [cleanedDocument]. */
    private fun copyDocumentType(dirtyDocument: Document, cleanedDocument: Document) {
        val documentType = dirtyDocument.documentType() ?: return
        cleanedDocument.insertChildren(0, documentType)
    }

    /**
     * Copies every `<body>` attribute whose key is in [allowedBodyAttributes] to the cleaned body.
     *
     * Attributes without a declared value are written as boolean attributes.
     */
    private fun copyBodyAttributes(dirtyDocument: Document, cleanedDocument: Document) {
        val targetBody = cleanedDocument.body()
        dirtyDocument.body().attributes()
            .filter { candidate -> candidate.key in allowedBodyAttributes }
            .forEach { permitted ->
                if (permitted.hasDeclaredValue()) {
                    targetBody.attr(permitted.key, permitted.value)
                } else {
                    targetBody.attr(permitted.key, true)
                }
            }
    }
}

View file

@ -0,0 +1,75 @@
package app.k9mail.html.cleaner
import org.jsoup.nodes.DataNode
import org.jsoup.nodes.Document
import org.jsoup.nodes.Element
import org.jsoup.nodes.Node
import org.jsoup.nodes.TextNode
import org.jsoup.parser.Tag
import org.jsoup.select.NodeTraversor
import org.jsoup.select.NodeVisitor
/** Lower-case tag names that may be copied out of the original document's `<head>`. */
private val ALLOWED_TAGS = listOf("style", "meta", "base")

/**
 * Copies the allowed parts of the `<head>` of one document into the `<head>` of another.
 *
 * The actual filtering is done by [CleaningVisitor] while the source head is traversed.
 */
internal class HeadCleaner {
    /**
     * Traverses the head of [dirtyDocument] and appends copies of the nodes accepted by
     * [CleaningVisitor] to the head of [cleanedDocument].
     */
    fun clean(dirtyDocument: Document, cleanedDocument: Document) {
        val dirtyHead = dirtyDocument.head()
        val visitor = CleaningVisitor(dirtyHead, cleanedDocument.head())
        NodeTraversor.traverse(visitor, dirtyHead)
    }
}
/**
 * [NodeVisitor] that mirrors the allowed nodes below [root] into [destination].
 *
 * Elements whose tag is in `ALLOWED_TAGS` (and that are not a `<meta http-equiv="refresh">`) are copied
 * together with their attributes. Any other element, except [root] itself, is skipped along with everything
 * below it. Text nodes are copied as long as no element is currently being skipped; data nodes are copied
 * only when their parent is an allowed element.
 *
 * @param root the element the traversal starts at; it is never copied and never skipped.
 * @param destination the element that receives the copies. Internally this tracks the current insertion
 * point while descending into copied elements.
 */
internal class CleaningVisitor(
    private val root: Element,
    private var destination: Element,
) : NodeVisitor {
    /** The rejected element whose subtree is currently being ignored, or `null` when copying. */
    private var elementToSkip: Element? = null

    override fun head(source: Node, depth: Int) {
        // Everything inside a rejected element is ignored until that element is left in tail().
        if (elementToSkip != null) return

        when (source) {
            is Element -> enterElement(source)
            is TextNode -> destination.appendChild(TextNode(source.wholeText))
            is DataNode -> if (isSafeTag(source.parent())) {
                destination.appendChild(DataNode(source.wholeData))
            }
        }
    }

    override fun tail(source: Node, depth: Int) {
        when {
            // Leaving the rejected element: resume copying.
            source === elementToSkip -> elementToSkip = null
            // Leaving a copied element: move the insertion point back up one level.
            source is Element && isSafeTag(source) -> destination = destination.parent() ?: error("Missing parent")
        }
    }

    /** Either copies [element] and descends into the copy, or marks it as the element to skip. */
    private fun enterElement(element: Element) {
        if (!isSafeTag(element)) {
            if (element !== root) {
                elementToSkip = element
            }
            return
        }

        val tagName = element.tagName()
        val copiedAttributes = element.attributes().clone()
        val copy = Element(Tag.valueOf(tagName), element.baseUri(), copiedAttributes)
        destination.appendChild(copy)
        destination = copy
    }

    /** `true` when [node] is non-null, is not a meta refresh, and its lower-cased name is in `ALLOWED_TAGS`. */
    private fun isSafeTag(node: Node?): Boolean =
        node != null && !isMetaRefresh(node) && node.nodeName().lowercase() in ALLOWED_TAGS

    /** `true` for a `meta` node whose `http-equiv` attribute equals "refresh" after trimming and lower-casing. */
    private fun isMetaRefresh(node: Node): Boolean =
        node.nodeName().lowercase() == "meta" &&
            node.attributes().getIgnoreCase("http-equiv").trim().lowercase() == "refresh"
}

View file

@ -0,0 +1,5 @@
package app.k9mail.html.cleaner
/**
 * Supplies additional HTML markup for the `<head>` of a document.
 */
interface HtmlHeadProvider {
/** HTML fragment to be added to the document head. */
val headHtml: String
}

View file

@ -0,0 +1,25 @@
package app.k9mail.html.cleaner
import org.jsoup.nodes.Document
/**
 * Turns an HTML string into sanitized HTML that also contains custom head contents.
 *
 * @param htmlHeadProvider source of the markup that is appended to the `<head>` of every processed document.
 */
class HtmlProcessor(private val htmlHeadProvider: HtmlHeadProvider) {
    private val htmlSanitizer = HtmlSanitizer()

    /**
     * Sanitizes [html], appends the head markup from the [HtmlHeadProvider], and returns the document
     * serialized without pretty printing.
     */
    fun processForDisplay(html: String): String {
        val document = htmlSanitizer.sanitize(html)
        document.head().append(htmlHeadProvider.headHtml)
        return renderCompact(document)
    }

    /** Serializes [document] with pretty printing disabled and an indent amount of 0. */
    private fun renderCompact(document: Document): String {
        val settings = document.outputSettings()
        settings.prettyPrint(false)
        settings.indentAmount(0)
        return document.html()
    }
}

View file

@ -0,0 +1,16 @@
package app.k9mail.html.cleaner
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
/**
 * Parses an HTML string and returns a sanitized [Document].
 *
 * The body is cleaned by [BodyCleaner] first; afterwards [HeadCleaner] copies the allowed head contents
 * of the parsed document into the cleaned one.
 */
internal class HtmlSanitizer {
    private val headCleaner = HeadCleaner()
    private val bodyCleaner = BodyCleaner()

    /** Parses [html] with jsoup and returns the cleaned document. */
    fun sanitize(html: String): Document =
        Jsoup.parse(html).let { parsed ->
            bodyCleaner.clean(parsed).also { sanitized ->
                headCleaner.clean(parsed, sanitized)
            }
        }
}

View file

@ -0,0 +1,539 @@
package app.k9mail.html.cleaner
import assertk.assertThat
import assertk.assertions.isEqualTo
import org.jsoup.nodes.Document
import org.junit.Test
/**
 * Tests for [HtmlSanitizer].
 *
 * Each test feeds an HTML string to [HtmlSanitizer.sanitize] and compares the compact serialization of the
 * resulting document (see [toCompactString]) with the expected markup. Multi-line fixtures are collapsed to a
 * single line with [trimLineBreaks] or [compactHtml] before use.
 */
class HtmlSanitizerTest {
private val htmlSanitizer = HtmlSanitizer()
// --- Removal of <meta http-equiv="refresh"> in various positions and spellings ---
@Test
fun shouldRemoveMetaRefreshInHead() {
val html =
"""
<html>
<head><meta http-equiv="refresh" content="1; URL=http://example.com/"></head>
<body>Message</body>
</html>
""".trimIndent().trimLineBreaks()
val result = htmlSanitizer.sanitize(html)
assertThat(result.toCompactString()).isEqualTo("<html><head></head><body>Message</body></html>")
}
@Test
fun shouldRemoveMetaRefreshBetweenHeadAndBody() {
val html =
"""
<html>
<head></head>
<meta http-equiv="refresh" content="1; URL=http://example.com/">
<body>Message</body>
</html>
""".trimIndent().trimLineBreaks()
val result = htmlSanitizer.sanitize(html)
assertThat(result.toCompactString()).isEqualTo("<html><head></head><body>Message</body></html>")
}
@Test
fun shouldRemoveMetaRefreshInBody() {
val html =
"""
<html>
<head></head>
<body><meta http-equiv="refresh" content="1; URL=http://example.com/">Message</body>
</html>
""".trimIndent().trimLineBreaks()
val result = htmlSanitizer.sanitize(html)
assertThat(result.toCompactString()).isEqualTo("<html><head></head><body>Message</body></html>")
}
@Test
fun shouldRemoveMetaRefreshWithUpperCaseAttributeValue() {
val html =
"""
<html>
<head><meta http-equiv="REFRESH" content="1; URL=http://example.com/"></head>
<body>Message</body>
</html>
""".trimIndent().trimLineBreaks()
val result = htmlSanitizer.sanitize(html)
assertThat(result.toCompactString()).isEqualTo("<html><head></head><body>Message</body></html>")
}
@Test
fun shouldRemoveMetaRefreshWithMixedCaseAttributeValue() {
val html =
"""
<html>
<head><meta http-equiv="Refresh" content="1; URL=http://example.com/"></head>
<body>Message</body>
</html>
""".trimIndent().trimLineBreaks()
val result = htmlSanitizer.sanitize(html)
assertThat(result.toCompactString()).isEqualTo("<html><head></head><body>Message</body></html>")
}
@Test
fun shouldRemoveMetaRefreshWithoutQuotesAroundAttributeValue() {
val html =
"""
<html>
<head><meta http-equiv=refresh content="1; URL=http://example.com/"></head>
<body>Message</body>
</html>
""".trimIndent().trimLineBreaks()
val result = htmlSanitizer.sanitize(html)
assertThat(result.toCompactString()).isEqualTo("<html><head></head><body>Message</body></html>")
}
// The attribute value carries a trailing space ("refresh ").
@Test
fun shouldRemoveMetaRefreshWithSpacesInAttributeValue() {
val html =
"""
<html>
<head><meta http-equiv="refresh " content="1; URL=http://example.com/"></head>
<body>Message</body>
</html>
""".trimIndent().trimLineBreaks()
val result = htmlSanitizer.sanitize(html)
assertThat(result.toCompactString()).isEqualTo("<html><head></head><body>Message</body></html>")
}
@Test
fun shouldRemoveMultipleMetaRefreshTags() {
val html =
"""
<html>
<head><meta http-equiv="refresh" content="1; URL=http://example.com/"></head>
<body><meta http-equiv="refresh" content="1; URL=http://example.com/">Message</body>
</html>
""".trimIndent().trimLineBreaks()
val result = htmlSanitizer.sanitize(html)
assertThat(result.toCompactString()).isEqualTo("<html><head></head><body>Message</body></html>")
}
@Test
fun shouldRemoveMetaRefreshButKeepOtherMetaTags() {
val html =
"""
<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=UTF-8">
<meta http-equiv="refresh" content="1; URL=http://example.com/">
</head>
<body>Message</body>
</html>
""".trimIndent().trimLineBreaks()
val result = htmlSanitizer.sanitize(html)
assertThat(result.toCompactString()).isEqualTo(
"""
<html>
<head><meta http-equiv="content-type" content="text/html; charset=UTF-8"></head>
<body>Message</body>
</html>
""".trimIndent().trimLineBreaks(),
)
}
// --- Document structure normalization ---
// The XML declaration is expected to be absent from the output.
@Test
fun shouldProduceValidHtmlFromHtmlWithXmlDeclaration() {
val html =
"""
<?xml version="1.0" encoding="UTF-8"?>
<html>
<head></head>
<body></body>
</html>
""".trimIndent().trimLineBreaks()
val result = htmlSanitizer.sanitize(html)
assertThat(result.toCompactString()).isEqualTo("<html><head></head><body></body></html>")
}
// A <tbody> element is expected to be inserted around the table rows.
@Test
fun shouldNormalizeTables() {
val html = "<html><head></head><body><table><tr><td></td><td></td></tr></table></body></html>"
val result = htmlSanitizer.sanitize(html)
assertThat(result.toCompactString()).isEqualTo(
"<html><head></head><body><table><tbody><tr><td></td><td></td></tr></tbody></table></body></html>",
)
}
// NOTE: despite the test name, the expected output below has the "<!==>" sequences removed entirely,
// not HTML-encoded.
@Test
fun shouldHtmlEncodeXmlDirectives() {
val html =
"""
<html>
<head></head>
<body>
<table><tr><td><!==><!==>Hmailserver service shutdown:</td><td><!==><!==>Ok</td></tr></table>
</body>
</html>
""".trimIndent().trimLineBreaks()
val result = htmlSanitizer.sanitize(html)
assertThat(result.toCompactString()).isEqualTo(
"""
<html>
<head></head>
<body><table><tbody><tr><td>Hmailserver service shutdown:</td><td>Ok</td></tr></tbody></table></body>
</html>
""".trimIndent().trimLineBreaks(),
)
}
// --- Tags and attributes that must survive sanitization ---
// Both "<hr>" and "<hr />" are expected to be serialized as "<hr>".
@Test
fun shouldKeepHrTags() {
val html = "<html><head></head><body>one<hr>two<hr />three</body></html>"
val result = htmlSanitizer.sanitize(html)
assertThat(result.toCompactString()).isEqualTo("<html><head></head><body>one<hr>two<hr>three</body></html>")
}
@Test
fun shouldKeepInsDelTags() {
val html = "<html><head></head><body><ins>Inserted</ins><del>Deleted</del></body></html>"
val result = htmlSanitizer.sanitize(html)
assertThat(result.toCompactString()).isEqualTo(html)
}
@Test
fun shouldKeepMapAreaTags() {
val html =
"""
<html>
<head></head>
<body>
<map name="planetmap">
<area shape="rect" coords="0,0,82,126" href="http://domain.com/sun.htm" alt="Sun">
<area shape="circle" coords="90,58,3" href="http://domain.com/mercur.htm" alt="Mercury">
<area shape="circle" coords="124,58,8" href="http://domain.com/venus.htm" alt="Venus">
</map>
</body>
</html>
""".trimIndent().trimLineBreaks()
val result = htmlSanitizer.sanitize(html)
assertThat(result.toCompactString()).isEqualTo(html)
}
@Test
fun shouldKeepImgUsemap() {
val html =
"""
<html>
<head></head>
<body><img src="http://domain.com/image.jpg" usemap="#planetmap"></body>
</html>
""".trimIndent().trimLineBreaks()
val result = htmlSanitizer.sanitize(html)
assertThat(result.toCompactString()).isEqualTo(html)
}
// Of <title>, <style> and <script> in the head, only <style> is expected to remain.
@Test
fun shouldKeepAllowedElementsInHeadAndSkipTheRest() {
val html =
"""
<html>
<head>
<title>remove this</title>
<style>keep this</style>
<script>remove this</script>
</head>
</html>
""".trimIndent().trimLineBreaks()
val result = htmlSanitizer.sanitize(html)
assertThat(result.toCompactString())
.isEqualTo("<html><head><style>keep this</style></head><body></body></html>")
}
@Test
fun shouldRemoveIFrames() {
val html = """<html><body><iframe src="http://www.google.com" /></body></html>"""
val result = htmlSanitizer.sanitize(html)
assertThat(result.toCompactString()).isEqualTo("<html><head></head><body></body></html>")
}
@Test
fun shouldKeepFormattingTags() {
val html = """<html><body><center><font face="Arial" color="red" size="12">A</font></center></body></html>"""
val result = htmlSanitizer.sanitize(html)
assertThat(result.toCompactString()).isEqualTo(
"""
<html>
<head></head>
<body><center><font face="Arial" color="red" size="12">A</font></center></body>
</html>
""".trimIndent().trimLineBreaks(),
)
}
// This test will fail when jsoup updates its list of allowed "protocols" for the a.href attribute.
// When that happens, please adjust the removeProtocols("a", "href", …) line in BodyCleaner.
@Test
fun shouldKeepUris() {
val html =
"""
<html>
<body>
<a href="http://example.com/index.html">HTTP</a>
<a href="https://example.com/default.html">HTTPS</a>
<a href="mailto:user@example.com">Mailto</a>
<a href="tel:00442079460111">Telephone</a>
<a href="sms:00442079460111">SMS</a>
<a href="sip:user@example.com">SIP</a>
<a href="unknown:foobar">Unknown</a>
<a href="rtsp://example.com/media.mp4">RTSP</a>
</body>
</html>
""".trimIndent().trimLineBreaks()
val result = htmlSanitizer.sanitize(html)
assertThat(result.toCompactString()).isEqualTo(
"""
<html>
<head></head>
<body>
<a href="http://example.com/index.html">HTTP</a>
<a href="https://example.com/default.html">HTTPS</a>
<a href="mailto:user@example.com">Mailto</a>
<a href="tel:00442079460111">Telephone</a>
<a href="sms:00442079460111">SMS</a>
<a href="sip:user@example.com">SIP</a>
<a href="unknown:foobar">Unknown</a>
<a href="rtsp://example.com/media.mp4">RTSP</a>
</body>
</html>
""".trimIndent().trimLineBreaks(),
)
}
@Test
fun shouldKeepDirAttribute() {
val html =
"""
<html>
<head></head>
<body><table><tbody><tr><td dir="rtl"></td></tr></tbody></table></body>
</html>
""".trimIndent().trimLineBreaks()
val result = htmlSanitizer.sanitize(html)
assertThat(result.toCompactString()).isEqualTo(html)
}
// "onload" is expected to be dropped while "style", "class" and the value-less "id" are kept.
@Test
fun shouldKeepAllowedBodyAttributes() {
val html =
"""
<html>
<body style="color: #fff" onload="alert()" class="body" id></body>
</html>
""".trimIndent().trimLineBreaks()
val result = htmlSanitizer.sanitize(html)
assertThat(result.toCompactString()).isEqualTo(
"""
<html>
<head></head>
<body style="color: #fff" class="body" id></body>
</html>
""".trimIndent().trimLineBreaks(),
)
}
// --- Doctype preservation ---
@Test
fun `should keep HTML 5 doctype`() {
val html =
"""
<!doctype html>
<html><head></head><body>text</body></html>
""".trimIndent().trimLineBreaks()
val result = htmlSanitizer.sanitize(html)
assertThat(result.toCompactString()).isEqualTo(html)
}
// The expected doctype differs from the input only in the case of the name ("HTML" -> "html").
@Test
fun `should keep HTML 4_01 doctype`() {
val html =
"""
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<html><head></head><body>text</body></html>
""".trimIndent().trimLineBreaks()
val result = htmlSanitizer.sanitize(html)
assertThat(result.toCompactString()).isEqualTo(
"""
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<html><head></head><body>text</body></html>
""".trimIndent().trimLineBreaks(),
)
}
// --- Individual elements/attributes given as bare fragments (no <html>/<body> in the input) ---
@Test
fun `should keep 'align' attribute on 'div' element`() {
val html = """<div align="center">text</div>"""
val result = htmlSanitizer.sanitize(html)
assertThat(result.toCompactString()).isEqualTo(
"""
<html>
<head></head>
<body>
<div align="center">text</div>
</body>
</html>
""".trimIndent().trimLineBreaks(),
)
}
@Test
fun `should keep 'name' attribute on 'a' element`() {
val html = """<a name="something">"""
val result = htmlSanitizer.sanitize(html)
assertThat(result.toCompactString()).isEqualTo(
"""
<html>
<head></head>
<body>
<a name="something"></a>
</body>
</html>
""".trimIndent().trimLineBreaks(),
)
}
@Test
fun `should keep 'tt' element`() {
assertTagsNotStripped("tt")
}
@Test
fun `should keep 'kbd' element`() {
assertTagsNotStripped("kbd")
}
@Test
fun `should keep 'samp' element`() {
assertTagsNotStripped("samp")
}
@Test
fun `should keep 'var' element`() {
assertTagsNotStripped("var")
}
@Test
fun `should keep 's' element`() {
assertTagsNotStripped("s")
}
@Test
fun `should keep 'base' element`() {
val html =
"""
<html>
<head>
<base href="https://domain.example/">
</head>
<body>
<a href="relative">Link</a>
</body>
</html>
""".compactHtml()
val result = htmlSanitizer.sanitize(html)
assertThat(result.toCompactString()).isEqualTo(html)
}
@Test
fun `should keep 'style' element in body`() {
val html =
"""
<html>
<head></head>
<body>
<style>.test { color: #000 }</style>
</body>
</html>
""".compactHtml()
val result = htmlSanitizer.sanitize(html)
assertThat(result.toCompactString()).isEqualTo(html)
}
/**
 * Sanitizes `<element>some text</element>` and asserts that the element is still present, wrapped in the
 * html/head/body skeleton.
 */
private fun assertTagsNotStripped(element: String) {
val html = """<$element>some text</$element>"""
val result = htmlSanitizer.sanitize(html)
assertThat(result.toCompactString()).isEqualTo(
"""
<html>
<head></head>
<body>
<$element>some text</$element>
</body>
</html>
""".trimIndent().trimLineBreaks(),
)
}
/** Serializes the document with pretty printing disabled and an indent amount of 0. */
private fun Document.toCompactString(): String {
outputSettings()
.prettyPrint(false)
.indentAmount(0)
return html()
}
/** Removes every "\n" character from the string. */
private fun String.trimLineBreaks() = replace("\n", "")
/** Trims each line individually and joins the lines without a separator. */
private fun String.compactHtml() = lines().joinToString(separator = "") { it.trim() }
}