Skip to content

Commit b561014

Browse files
committed
Fix HtmlToNodesParser and html_to_nodes
1 parent deea8ac commit b561014

File tree

2 files changed

+76
-54
lines changed

2 files changed

+76
-54
lines changed

telegraph/utils.py

Lines changed: 56 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -1,14 +1,19 @@
1-
try:
2-
from html.parser import HTMLParser # python 3.x
1+
# -*- coding: utf-8 -*-
2+
3+
try: # python 3.x
4+
from html.parser import HTMLParser
35
from html.entities import name2codepoint
46
from html import escape
5-
except ImportError:
6-
chr = unichr
77

8-
from HTMLParser import HTMLParser # python 2.x
8+
basestring = str
9+
10+
except ImportError: # python 2.x
11+
from HTMLParser import HTMLParser
912
from htmlentitydefs import name2codepoint
1013
from cgi import escape
1114

15+
chr = unichr
16+
1217
from .exceptions import NotAllowedTag, InvalidHTML
1318

1419

@@ -18,15 +23,26 @@
1823
'strong', 'u', 'ul', 'video'
1924
]
2025

26+
VOID_ELEMENTS = {
27+
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen',
28+
'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr'
29+
}
30+
2131

2232
class HtmlToNodesParser(HTMLParser):
2333
def __init__(self):
2434
HTMLParser.__init__(self)
2535

2636
self.nodes = []
2737

28-
self.current_node_list = self.nodes
29-
self.parent_node_lists = []
38+
self.current_nodes = self.nodes
39+
self.parent_nodes = []
40+
41+
def add_str_node(self, s):
42+
if self.current_nodes and isinstance(self.current_nodes[-1], basestring):
43+
self.current_nodes[-1] += s
44+
else:
45+
self.current_nodes.append(s)
3046

3147
def handle_starttag(self, tag, attrs_list):
3248
if tag not in ALLOWED_TAGS:
@@ -42,30 +58,38 @@ def handle_starttag(self, tag, attrs_list):
4258

4359
node['attrs'] = attrs
4460

45-
self.current_node_list.append(node)
46-
self.parent_node_lists.append(self.current_node_list)
61+
self.current_nodes.append(node)
62+
self.parent_nodes.append(self.current_nodes)
4763

48-
self.current_node_list = node['children']
64+
self.current_nodes = node['children']
4965

5066
def handle_endtag(self, tag):
51-
self.current_node_list = self.parent_node_lists.pop(-1)
67+
self.current_nodes = self.parent_nodes.pop()
5268

53-
if self.current_node_list[-1]['tag'] != tag:
69+
last_node = self.current_nodes[-1]
70+
71+
if last_node['tag'] != tag:
5472
raise InvalidHTML
5573

74+
if not last_node['children']:
75+
last_node.pop('children')
76+
5677
def handle_data(self, data):
57-
self.current_node_list.append(data)
78+
if data == '\n':
79+
return
80+
81+
self.add_str_node(data)
5882

5983
def handle_entityref(self, name):
60-
self.current_node_list.append(chr(name2codepoint[name]))
84+
self.add_str_node(chr(name2codepoint[name]))
6185

6286
def handle_charref(self, name):
6387
if name.startswith('x'):
6488
c = chr(int(name[1:], 16))
6589
else:
6690
c = chr(int(name))
6791

68-
self.current_node_list.append(c)
92+
self.add_str_node(c)
6993

7094

7195
def html_to_nodes(html_content):
@@ -80,7 +104,7 @@ def nodes_to_html(nodes):
80104

81105
stack = []
82106
tags_stack = []
83-
current_nodes = nodes
107+
current_nodes = nodes[:]
84108

85109
while True:
86110
if current_nodes:
@@ -95,7 +119,7 @@ def nodes_to_html(nodes):
95119
attrs_str = ['']
96120

97121
for attr, value in attrs.items():
98-
attrs_str.append('{}="{}"'.format(attr, value))
122+
attrs_str.append('{}="{}"'.format(attr, escape(value)))
99123
else:
100124
attrs_str = []
101125

@@ -104,20 +128,28 @@ def nodes_to_html(nodes):
104128
' '.join(attrs_str)
105129
))
106130

107-
children = node.get('children')
108-
109-
if children:
110-
stack.append(current_nodes)
111-
current_nodes = children
131+
children = node.get('children', [])
132+
stack.append(current_nodes)
133+
current_nodes = children
112134
else:
113135
html_content.append(escape(node))
114136

115137
if not current_nodes:
116138
if tags_stack:
117-
html_content.append('</{}>'.format(tags_stack.pop(-1)))
139+
closed_tag = tags_stack.pop()
140+
141+
last_el = html_content[-1]
142+
143+
if closed_tag in VOID_ELEMENTS and \
144+
last_el.startswith('<{}'.format(closed_tag)) and \
145+
not last_el.endswith('/>'):
146+
147+
html_content[-1] = last_el[:-1] + '/>'
148+
else:
149+
html_content.append('</{}>'.format(closed_tag))
118150

119151
if stack:
120-
current_nodes = stack.pop(-1)
152+
current_nodes = stack.pop()
121153
else:
122154
break
123155

tests/test_html_converter.py

Lines changed: 20 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -1,48 +1,44 @@
1-
import sys
21
from unittest import TestCase
32

43
from telegraph.exceptions import NotAllowedTag, InvalidHTML
54
from telegraph.utils import html_to_nodes, nodes_to_html
65

76
HTML_TEST_STR = """
8-
<p>Hello, world!</p>
7+
<p>Hello, world!<br/></p>
98
<p><a href="https://telegra.ph/">Test link&lt;/a&gt;</a></p>
9+
<figure>
10+
<img src="/file/6c2ecfdfd6881d37913fa.png"/>
11+
<figcaption></figcaption>
12+
</figure>
1013
""".replace('\n', '')
1114

1215
NODES_TEST_LIST = [
13-
{'tag': 'p', 'children': ['Hello, world!']},
14-
{'tag': 'p', 'children': [{
15-
'tag': 'a',
16-
'attrs': {'href': 'https://telegra.ph/'},
17-
'children': ['Test link', '<', '/a', '>']
18-
}]
19-
}
20-
]
21-
22-
NODES_TEST_LIST_PY35 = [
23-
{'tag': 'p', 'children': ['Hello, world!']},
16+
{'tag': 'p', 'children': ['Hello, world!', {'tag': 'br'}]},
2417
{'tag': 'p', 'children': [{
2518
'tag': 'a',
2619
'attrs': {'href': 'https://telegra.ph/'},
2720
'children': ['Test link</a>']
2821
}]
29-
}
22+
},
23+
{'tag': 'figure', 'children': [
24+
{'tag': 'img', 'attrs': {'src': '/file/6c2ecfdfd6881d37913fa.png'}},
25+
{'tag': 'figcaption'}
26+
]}
3027
]
3128

3229

3330
class TestHTMLConverter(TestCase):
3431
def test_html_to_nodes(self):
32+
self.assertEqual(
33+
html_to_nodes(HTML_TEST_STR),
34+
NODES_TEST_LIST
35+
)
3536

36-
if sys.version_info.major == 3 and sys.version_info.minor >= 5:
37-
self.assertEqual(
38-
html_to_nodes(HTML_TEST_STR),
39-
NODES_TEST_LIST_PY35
40-
)
41-
else:
42-
self.assertEqual(
43-
html_to_nodes(HTML_TEST_STR),
44-
NODES_TEST_LIST
45-
)
37+
def test_nodes_to_html(self):
38+
self.assertEqual(
39+
nodes_to_html(NODES_TEST_LIST),
40+
HTML_TEST_STR
41+
)
4642

4743
def test_html_to_nodes_invalid_html(self):
4844
with self.assertRaises(InvalidHTML):
@@ -52,12 +48,6 @@ def test_html_to_nodes_not_allowed_tag(self):
5248
with self.assertRaises(NotAllowedTag):
5349
html_to_nodes('<script src="localhost"></script>')
5450

55-
def test_nodes_to_html(self):
56-
self.assertEqual(
57-
nodes_to_html(NODES_TEST_LIST),
58-
HTML_TEST_STR
59-
)
60-
6151
def test_nodes_to_html_nested(self):
6252
self.assertEqual(
6353
nodes_to_html([

0 commit comments

Comments
 (0)