1
- try :
2
- from html .parser import HTMLParser # python 3.x
1
+ # -*- coding: utf-8 -*-
2
+
3
+ try : # python 3.x
4
+ from html .parser import HTMLParser
3
5
from html .entities import name2codepoint
4
6
from html import escape
5
- except ImportError :
6
- chr = unichr
7
7
8
- from HTMLParser import HTMLParser # python 2.x
8
+ basestring = str
9
+
10
+ except ImportError : # python 2.x
11
+ from HTMLParser import HTMLParser
9
12
from htmlentitydefs import name2codepoint
10
13
from cgi import escape
11
14
15
+ chr = unichr
16
+
12
17
from .exceptions import NotAllowedTag , InvalidHTML
13
18
14
19
18
23
'strong' , 'u' , 'ul' , 'video'
19
24
]
20
25
26
# Tags that nodes_to_html may serialise in self-closing form ('<br/>')
# instead of emitting a separate closing tag.
VOID_ELEMENTS = {
    'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen',
    'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
}
30
+
21
31
22
32
class HtmlToNodesParser (HTMLParser ):
23
33
def __init__ (self ):
24
34
HTMLParser .__init__ (self )
25
35
26
36
self .nodes = []
27
37
28
- self .current_node_list = self .nodes
29
- self .parent_node_lists = []
38
+ self .current_nodes = self .nodes
39
+ self .parent_nodes = []
40
+
41
def add_str_node(self, s):
    """Add text ``s`` to the current node list, merging adjacent text.

    If the last entry of ``self.current_nodes`` is already a string,
    ``s`` is concatenated onto it, so text, entity references and
    character references that follow each other end up as a single
    string node. Otherwise ``s`` is appended as a new entry.

    :param s: text to add
    """
    siblings = self.current_nodes

    if siblings and isinstance(siblings[-1], basestring):
        siblings[-1] += s
    else:
        siblings.append(s)
30
46
31
47
def handle_starttag (self , tag , attrs_list ):
32
48
if tag not in ALLOWED_TAGS :
@@ -42,30 +58,38 @@ def handle_starttag(self, tag, attrs_list):
42
58
43
59
node ['attrs' ] = attrs
44
60
45
- self .current_node_list .append (node )
46
- self .parent_node_lists .append (self .current_node_list )
61
+ self .current_nodes .append (node )
62
+ self .parent_nodes .append (self .current_nodes )
47
63
48
- self .current_node_list = node ['children' ]
64
+ self .current_nodes = node ['children' ]
49
65
50
66
def handle_endtag(self, tag):
    """Close the innermost open element and step back to its parent list.

    An element that ends up with no children has its empty ``'children'``
    entry removed from the node dict.

    :param tag: name of the closing tag
    :raises InvalidHTML: if no element is open, or the innermost open
        element is not ``tag``
    """
    if not self.parent_nodes:
        # A closing tag with nothing open: report it as invalid HTML
        # instead of letting list.pop() raise IndexError.
        raise InvalidHTML

    self.current_nodes = self.parent_nodes.pop()

    # handle_starttag appended the element's node to the parent list
    # before descending into it, so it is the parent's last entry.
    last_node = self.current_nodes[-1]

    if last_node['tag'] != tag:
        raise InvalidHTML

    if not last_node['children']:
        last_node.pop('children')
76
+
56
77
def handle_data(self, data):
    """Record a run of text, ignoring a lone newline.

    A chunk that is exactly one newline character is dropped; any other
    text is passed to ``add_str_node`` unchanged.

    :param data: text chunk delivered by ``HTMLParser``
    """
    if data == '\n':
        return

    self.add_str_node(data)
58
82
59
83
def handle_entityref(self, name):
    """Add the character for a named entity reference such as ``&amp;``.

    :param name: entity name without the surrounding ``&`` and ``;``
    """
    codepoint = name2codepoint.get(name)

    if codepoint is None:
        # Unknown entity name: keep it as literal text rather than
        # failing with KeyError on the name2codepoint lookup.
        self.add_str_node('&{};'.format(name))
    else:
        self.add_str_node(chr(codepoint))
61
85
62
86
def handle_charref(self, name):
    """Add the character for a numeric character reference.

    :param name: the text between ``&#`` and ``;``: decimal digits, or
        ``x``/``X`` followed by hexadecimal digits
    """
    # Accept both 'x' and 'X' as the hex marker; with only lowercase
    # handled, '&#X41;' fell through to int('X41') and raised ValueError.
    if name[:1] in ('x', 'X'):
        c = chr(int(name[1:], 16))
    else:
        c = chr(int(name))

    self.add_str_node(c)
69
93
70
94
71
95
def html_to_nodes (html_content ):
@@ -80,7 +104,7 @@ def nodes_to_html(nodes):
80
104
81
105
stack = []
82
106
tags_stack = []
83
- current_nodes = nodes
107
+ current_nodes = nodes [:]
84
108
85
109
while True :
86
110
if current_nodes :
@@ -95,7 +119,7 @@ def nodes_to_html(nodes):
95
119
attrs_str = ['' ]
96
120
97
121
for attr , value in attrs .items ():
98
- attrs_str .append ('{}="{}"' .format (attr , value ))
122
+ attrs_str .append ('{}="{}"' .format (attr , escape ( value ) ))
99
123
else :
100
124
attrs_str = []
101
125
@@ -104,20 +128,28 @@ def nodes_to_html(nodes):
104
128
' ' .join (attrs_str )
105
129
))
106
130
107
- children = node .get ('children' )
108
-
109
- if children :
110
- stack .append (current_nodes )
111
- current_nodes = children
131
+ children = node .get ('children' , [])
132
+ stack .append (current_nodes )
133
+ current_nodes = children
112
134
else :
113
135
html_content .append (escape (node ))
114
136
115
137
if not current_nodes :
116
138
if tags_stack :
117
- html_content .append ('</{}>' .format (tags_stack .pop (- 1 )))
139
+ closed_tag = tags_stack .pop ()
140
+
141
+ last_el = html_content [- 1 ]
142
+
143
+ if closed_tag in VOID_ELEMENTS and \
144
+ last_el .startswith ('<{}' .format (closed_tag )) and \
145
+ not last_el .endswith ('/>' ):
146
+
147
+ html_content [- 1 ] = last_el [:- 1 ] + '/>'
148
+ else :
149
+ html_content .append ('</{}>' .format (closed_tag ))
118
150
119
151
if stack :
120
- current_nodes = stack .pop (- 1 )
152
+ current_nodes = stack .pop ()
121
153
else :
122
154
break
123
155
0 commit comments