@@ -10,14 +10,15 @@ import org.apache.commons.validator.routines.EmailValidator
10
10
import scala .collection .immutable .SortedSet
11
11
import scala .jdk .CollectionConverters ._
12
12
import scala .util .matching .Regex
13
+
13
14
import UrlDetector ._
14
15
15
16
final class UrlDetector private (
16
- options : UrlDetectorOptions ,
17
- allowed : Option [NonEmptySet [Host ]],
18
- denied : Option [NonEmptySet [Host ]],
19
- emailValidator : EmailValidator
20
- ) {
17
+ options : UrlDetectorOptions ,
18
+ allowed : Option [NonEmptySet [Host ]],
19
+ denied : Option [NonEmptySet [Host ]],
20
+ emailValidator : EmailValidator
21
+ ) {
21
22
22
23
// Allow-list after `removeWwwSubdomain` has been applied to every host. Empty when no allow-list
// was supplied, or when `NonEmptySet.fromSet` yields nothing for the transformed set.
private val allowedWithoutWww: Option[NonEmptySet[Host]] =
  for {
    hosts    <- allowed
    stripped <- NonEmptySet.fromSet(hosts.toSortedSet.flatMap(removeWwwSubdomain))
  } yield stripped
@@ -39,44 +40,11 @@ final class UrlDetector private (
39
40
.detect()
40
41
.asScala
41
42
.toList
42
- .map { lUrl =>
43
- val rawUrl = lUrl.toString
44
- val cleanedUrl = cleanUrlForBracketMatch(content, rawUrl)
45
- AbsoluteUrl .parse(sanitize(cleanedUrl))
46
- }
43
+ .map(url => AbsoluteUrl .parse(sanitize(cleanUrlForBracketMatch(content, url.toString))))
47
44
.filter(url => allowedUrl(url) && notEmail(url) && validTopLevelDomain(url))
48
45
.toSet
49
46
}
50
47
51
- private def cleanUrlForBracketMatch (originalText : String , urlStr : String ): String = {
52
- val allowedSpecialChars : Set [Char ] = Set (
53
- '-' , '.' , '_' , '~' , ':' , '/' , '?' , '#' , '[' , ']' , '@' ,
54
- '!' , '$' , '&' , '\' ' , '(' , ')' , '*' , '+' , ',' , ';' , '=' , '%'
55
- )
56
-
57
- def isAllowedUrlChar (c : Char ): Boolean = {
58
- c.isLetterOrDigit || allowedSpecialChars.contains(c)
59
- }
60
-
61
- val startIndexOpt = Option (originalText.indexOf(urlStr)).filter(_ >= 0 )
62
-
63
- startIndexOpt match {
64
- case None => urlStr
65
-
66
- case Some (startIndex) =>
67
- val extendedUrl = originalText
68
- .substring(startIndex)
69
- .takeWhile(isAllowedUrlChar)
70
-
71
- val emptyParensPattern = """ \(\)[^()]*""" .r
72
-
73
- emptyParensPattern.findFirstMatchIn(extendedUrl) match {
74
- case Some (m) => extendedUrl.substring(0 , m.start)
75
- case None => extendedUrl
76
- }
77
- }
78
- }
79
-
80
48
/**
81
49
* Method that creates a [[io.lambdaworks.detection.UrlDetector ]] with a set of hosts to allow.
82
50
*
@@ -119,6 +87,22 @@ final class UrlDetector private (
119
87
/**
 * Checks `url` against the configured host lists: it must be contained in the allow-list (when one
 * is defined) and must not be contained in the deny-list (when one is defined).
 */
private def allowedUrl(url: AbsoluteUrl): Boolean = {
  def listed(hosts: NonEmptySet[Host]): Boolean = containsHost(hosts, url)

  // `&&` keeps the deny-list check from running when the allow-list already rejected the URL.
  allowedWithoutWww.forall(listed) && !deniedWithoutWww.exists(listed)
}
121
89
90
/**
 * Re-derives a detected URL from the text it was found in.
 *
 * Finds the first occurrence of `urlStr` in `originalText`, extends it over every following
 * character that is a letter, a digit or a member of `AllowedSpecialChars`, and then drops
 * everything from the first `EmptyParensRegex` match onwards.
 *
 * @param originalText the text the URL was detected in
 * @param urlStr       the URL as reported by the detector
 * @return the re-derived URL, or `urlStr` unchanged when it does not occur in `originalText`
 */
private def cleanUrlForBracketMatch(originalText: String, urlStr: String): String =
  originalText.indexOf(urlStr) match {
    case missing if missing < 0 =>
      urlStr

    case from =>
      val candidate = originalText
        .drop(from)
        .takeWhile(ch => ch.isLetterOrDigit || AllowedSpecialChars.contains(ch))

      // Cut at the first empty pair of parentheses, if there is one.
      EmptyParensRegex
        .findFirstMatchIn(candidate)
        .fold(candidate)(hit => candidate.take(hit.start))
  }
105
+
122
106
/**
 * Tells whether any entry of `hosts` matches `url`.
 *
 * An entry that has a subdomain must be equal to the URL's host; an entry without one matches
 * when its apex domain is defined and equals the URL's apex domain.
 */
private def containsHost(hosts: NonEmptySet[Host], url: AbsoluteUrl): Boolean =
  hosts.exists { host =>
    host.subdomain match {
      case Some(_) => host == url.host
      case None    => host.apexDomain.exists(url.apexDomain.contains)
    }
  }
124
108
@@ -164,7 +148,12 @@ object UrlDetector {
164
148
*/
165
149
lazy val default : UrlDetector = UrlDetector (UrlDetectorOptions .Default )
166
150
167
- private final val SanitizeRegex : Regex = " [,!-.`/]+$" .r
151
// Non-alphanumeric characters that `cleanUrlForBracketMatch` accepts, in addition to letters and
// digits, while extending a URL through the surrounding text.
// NOTE(review): whitespace inside the literals below (e.g. in the quote character and at the start
// of the regex strings) may be an artifact of how this listing was extracted - confirm against the
// repository before relying on it.
+ private final val AllowedSpecialChars : Set [Char ] = Set (
152
+ '-' , '.' , '_' , '~' , ':' , '/' , '?' , '#' , '[' , ']' , '@' , '!' , '$' , '&' , '\' ' , '(' , ')' , '*' , '+' , ',' , ';' , '=' , '%'
153
+ )
154
+
155
// Pattern built around a literal `()` followed by any run of characters other than `(` and `)`;
// `cleanUrlForBracketMatch` truncates the extended URL at its first match.
+ private final val EmptyParensRegex : Regex = """ \(\)[^()]*""" .r
156
// Pattern anchored at the end of the input (`$`); presumably consumed by `sanitize` - verify.
// NOTE(review): inside the character class, `!-.` is a range (U+0021 to U+002E, which includes
// quotes, `#`, `$`, `%`, `&`, parentheses, `*`, `+`, `,`), not the three literals `!`, `-`, `.` -
// confirm this is intended.
+ private final val SanitizeRegex : Regex = " [,!-.`/]+$" .r
168
157
169
158
// Implicit `Ordering[Host]` obtained from `orderHost.toOrdering`; access is limited to the
// `detection` package. Presumably what the sorted-set operations on `Host` resolve - verify.
implicit private [detection] val orderingHost : Ordering [Host ] = orderHost.toOrdering
170
159
0 commit comments