Skip to content

Commit 7df65a8

Browse files
Refactor based on review suggestions
1 parent 3525d82 commit 7df65a8

File tree

2 files changed

+47
-50
lines changed

2 files changed

+47
-50
lines changed

src/main/scala/io/lambdaworks/detection/UrlDetector.scala

Lines changed: 29 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,15 @@ import org.apache.commons.validator.routines.EmailValidator
1010
import scala.collection.immutable.SortedSet
1111
import scala.jdk.CollectionConverters._
1212
import scala.util.matching.Regex
13+
1314
import UrlDetector._
1415

1516
final class UrlDetector private (
16-
options: UrlDetectorOptions,
17-
allowed: Option[NonEmptySet[Host]],
18-
denied: Option[NonEmptySet[Host]],
19-
emailValidator: EmailValidator
20-
) {
17+
options: UrlDetectorOptions,
18+
allowed: Option[NonEmptySet[Host]],
19+
denied: Option[NonEmptySet[Host]],
20+
emailValidator: EmailValidator
21+
) {
2122

2223
private val allowedWithoutWww: Option[NonEmptySet[Host]] =
2324
allowed.flatMap(allowed => NonEmptySet.fromSet(allowed.toSortedSet.flatMap(removeWwwSubdomain)))
@@ -39,44 +40,11 @@ final class UrlDetector private (
3940
.detect()
4041
.asScala
4142
.toList
42-
.map { lUrl =>
43-
val rawUrl = lUrl.toString
44-
val cleanedUrl = cleanUrlForBracketMatch(content, rawUrl)
45-
AbsoluteUrl.parse(sanitize(cleanedUrl))
46-
}
43+
.map(url => AbsoluteUrl.parse(sanitize(cleanUrlForBracketMatch(content, url.toString))))
4744
.filter(url => allowedUrl(url) && notEmail(url) && validTopLevelDomain(url))
4845
.toSet
4946
}
5047

51-
private def cleanUrlForBracketMatch(originalText: String, urlStr: String): String = {
52-
val allowedSpecialChars: Set[Char] = Set(
53-
'-', '.', '_', '~', ':', '/', '?', '#', '[', ']', '@',
54-
'!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '=', '%'
55-
)
56-
57-
def isAllowedUrlChar(c: Char): Boolean = {
58-
c.isLetterOrDigit || allowedSpecialChars.contains(c)
59-
}
60-
61-
val startIndexOpt = Option(originalText.indexOf(urlStr)).filter(_ >= 0)
62-
63-
startIndexOpt match {
64-
case None => urlStr
65-
66-
case Some(startIndex) =>
67-
val extendedUrl = originalText
68-
.substring(startIndex)
69-
.takeWhile(isAllowedUrlChar)
70-
71-
val emptyParensPattern = """\(\)[^()]*""".r
72-
73-
emptyParensPattern.findFirstMatchIn(extendedUrl) match {
74-
case Some(m) => extendedUrl.substring(0, m.start)
75-
case None => extendedUrl
76-
}
77-
}
78-
}
79-
8048
/**
8149
* Method that creates a [[io.lambdaworks.detection.UrlDetector]] with a set of hosts to allow.
8250
*
@@ -119,6 +87,22 @@ final class UrlDetector private (
11987
private def allowedUrl(url: AbsoluteUrl): Boolean =
12088
allowedWithoutWww.forall(containsHost(_, url)) && deniedWithoutWww.forall(!containsHost(_, url))
12189

90+
private def cleanUrlForBracketMatch(originalText: String, urlStr: String): String = {
91+
def isAllowedUrlChar(c: Char): Boolean =
92+
c.isLetterOrDigit || AllowedSpecialChars.contains(c)
93+
94+
Option(originalText.indexOf(urlStr)).filter(_ >= 0).fold(urlStr) { startIndex =>
95+
val extendedUrl = originalText
96+
.substring(startIndex)
97+
.takeWhile(isAllowedUrlChar)
98+
99+
EmptyParensRegex
100+
.findFirstMatchIn(extendedUrl)
101+
.map(m => extendedUrl.substring(0, m.start))
102+
.getOrElse(extendedUrl)
103+
}
104+
}
105+
122106
private def containsHost(hosts: NonEmptySet[Host], url: AbsoluteUrl): Boolean =
123107
hosts.exists(host => host.subdomain.fold(host.apexDomain.exists(url.apexDomain.contains))(_ => host == url.host))
124108

@@ -164,7 +148,12 @@ object UrlDetector {
164148
*/
165149
lazy val default: UrlDetector = UrlDetector(UrlDetectorOptions.Default)
166150

167-
private final val SanitizeRegex: Regex = "[,!-.`/]+$".r
151+
private final val AllowedSpecialChars: Set[Char] = Set(
152+
'-', '.', '_', '~', ':', '/', '?', '#', '[', ']', '@', '!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '=', '%'
153+
)
154+
155+
private final val EmptyParensRegex: Regex = """\(\)[^()]*""".r
156+
private final val SanitizeRegex: Regex = "[,!-.`/]+$".r
168157

169158
implicit private[detection] val orderingHost: Ordering[Host] = orderHost.toOrdering
170159

src/test/scala/io/lambdaworks/detection/UrlDetectorSpec.scala

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,11 @@ final class UrlDetectorSpec extends AnyFlatSpec with Matchers {
4545
),
4646
(
4747
"Parse https://learn.microsoft.com/en-us/previous-versions/windows/internet-explorer/ie-developer/platform-apis/aa752574(v=vs.85)?redirectedfrom=MSDN",
48-
Set(Url.parse("https://learn.microsoft.com/en-us/previous-versions/windows/internet-explorer/ie-developer/platform-apis/aa752574(v=vs.85)?redirectedfrom=MSDN"))
48+
Set(
49+
Url.parse(
50+
"https://learn.microsoft.com/en-us/previous-versions/windows/internet-explorer/ie-developer/platform-apis/aa752574(v=vs.85)?redirectedfrom=MSDN"
51+
)
52+
)
4953
),
5054
(
5155
"192.168.1.3 255.255.1.34 1234.34.34.5 0.0.0.0 192.168.1.257 2.3.4.5",
@@ -289,19 +293,23 @@ final class UrlDetectorSpec extends AnyFlatSpec with Matchers {
289293
Url.parse("http://test.link/g3WMrh"),
290294
Url.parse("http://test.link/HWRqhq"),
291295
Url.parse("http://test.link/KeKy")
292-
),
296+
)
293297
),
294-
(
295-
"Parse https://site.com/(v=1.2)",
296-
Set(Url.parse("https://site.com/(v=1.2"))
297-
),
298298
(
299-
"Parse https://learn.microsoft.com/en-us/previous-versions/windows/internet-explorer/ie-developer/platform-apis/aa752574(v=vs.85)?redirectedfrom=MSDN",
300-
Set(Url.parse("https://learn.microsoft.com/en-us/previous-versions/windows/internet-explorer/ie-developer/platform-apis/aa752574(v=vs.85)?redirectedfrom=MSDN"))
299+
"Parse https://site.com/(v=1.2)",
300+
Set(Url.parse("https://site.com/(v=1.2"))
301301
),
302302
(
303-
"Parse http://www.website.com/?utm_source=google%5BB%2B%5D&utm_medium=cpc&utm_content=google_ad(B)&utm_campaign=product",
304-
Set(Url.parse("http://www.website.com/?utm_source=google%5BB%2B%5D&utm_medium=cpc&utm_content=google_ad(B)&utm_campaign=product"))
303+
"Parse https://learn.microsoft.com/en-us/previous-versions/windows/internet-explorer/ie-developer/platform-apis/aa752574(v=vs.85)?redirectedfrom=MSDN " +
304+
"and http://www.website.com/?utm_source=google%5BB%2B%5D&utm_medium=cpc&utm_content=google_ad(B)&utm_campaign=product",
305+
Set(
306+
Url.parse(
307+
"https://learn.microsoft.com/en-us/previous-versions/windows/internet-explorer/ie-developer/platform-apis/aa752574(v=vs.85)?redirectedfrom=MSDN"
308+
),
309+
Url.parse(
310+
"http://www.website.com/?utm_source=google%5BB%2B%5D&utm_medium=cpc&utm_content=google_ad(B)&utm_campaign=product"
311+
)
312+
)
305313
)
306314
)
307315

0 commit comments

Comments
 (0)