Commit 4d90660 — Merge pull request #4 from Danceiny/dev: "update examples"
(2 parents: 6d340e4 + d4f63ce; 35 files changed, +891 −433 lines)
README.md

Lines changed: 153 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,159 @@
1616
- 安装稳定版:
1717
>`pip install -U parser_engine`
1818
19+
### 示例
20+
- 极简版,使用`CrawlSpider`的rules机制。
21+
```python
22+
from parser_engine import TemplateAnnotation
23+
@TemplateAnnotation(tpls="demo")
24+
class DemoSpider4(CrawlSpider):
25+
name = "demo4"
26+
start_urls = [
27+
"http://github.cannot.cc/baixing-helper"
28+
]
29+
```
30+
31+
- 使用scrapy_redis,解析start_urls的响应。
32+
```python
33+
from parser_engine import TemplateAnnotation
34+
from parser_engine.clue.spider import ClueSpider
35+
@TemplateAnnotation(start_url_tpl=({
36+
"name": "zhongguozhongqi_xiaoshouwangluo",
37+
"itemname": "HuocheDealerItem",
38+
"parent": {
39+
"xpath": "//tr[@class=\"bgcolor2\"]"
40+
},
41+
"fields": [
42+
{
43+
"key": "area",
44+
"xpath": "td[1]/text()",
45+
"value_type": "stripped_string"
46+
}, {
47+
"key": "leads_name",
48+
"xpath": "td[2]/text()",
49+
"value_type": "stripped_string"
50+
}, {
51+
"key": "address",
52+
"xpath": "td[3]/text()",
53+
"value_type": "stripped_string"
54+
}, {
55+
"key": "phone",
56+
"xpath": "td[5]/text()",
57+
"value_type": "stripped_string"
58+
}
59+
]
60+
}), channel='zhongguozhongqi', leads_src='中国重汽')
61+
class ZhongguozhongqiSpider(ClueSpider):
62+
name = 'zhongguozhongqi'
63+
def parse(self, response):
64+
items = self._parse_start_url(response)
65+
for item in items:
66+
phone = item.get('phone')
67+
if phone:
68+
item['phone'] = phone.replace(' ', ',')  # NOTE: separator character was lost in extraction; a space is the most likely original — verify against the repository
69+
yield item
70+
self.finish_clue(response, len(items))
71+
```
72+
73+
- 使用scrapy_redis,灵活运用多种PE特性。
74+
```python
75+
from parser_engine.clue.spider import ClueSpider
76+
from parser_engine import TemplateAnnotation
77+
from parser_engine.clue.items import ClueItem
78+
from parser_engine.request import TaskRequest
79+
from scrapy import Request
80+
@TemplateAnnotation(start_url_tpl=({
81+
"name": "youka_shop_listing_api",
82+
"parent": {
83+
"json_key": "data",
84+
},
85+
"fields": [{
86+
"key": "totalPage",
87+
"json_key": "totalPage",
88+
89+
}, {
90+
"key": "ids",
91+
"json_path": "dataList[*].id"
92+
}]
93+
},),
94+
tpls=({
95+
"name": "youka_shop_detail_api",
96+
"itemname": "HuocheDealerItem",
97+
"parent": {
98+
"json_key": "data",
99+
},
100+
"fields": [{
101+
"key": "company_type",
102+
"json_key": "category",
103+
"mapper": {
104+
1: "二手车直营店",
105+
2: "4S店"
106+
}
107+
}, {
108+
"key": "dealer_id",
109+
"json_key": "id",
110+
"required": 1,
111+
}, {
112+
"key": "leads_name",
113+
"json_key": "shopName",
114+
}, {
115+
"key": "area",
116+
"json_path": "districtDto.districtName",
117+
"value_type": "singleton"
118+
}, {
119+
"key": "city",
120+
"json_path": "cityDto.cityName",
121+
"value_type": "singleton"
122+
}, {
123+
"key": "service_phone",
124+
"default_value": "",
125+
}, {
126+
"key": "wechat",
127+
"json_key": "wechat",
128+
}, {
129+
"key": "tags",
130+
"json_key": "tags",
131+
"join": ","
132+
}]
133+
}), channel='youka', leads_src='优卡')
134+
class YoukaSpider(ClueSpider):
135+
name = 'youka'
136+
custom_settings = {
137+
'CONCURRENT_REQUESTS': 2,
138+
'CONCURRENT_REQUESTS_PER_DOMAIN': 1
139+
}
140+
def parse(self, response):
141+
items = self._parse_start_url(response)
142+
meta = response.meta
143+
clue_id = meta.get('clue_id')
144+
from_url = response.request.url
145+
if meta.get('open_pages'):
146+
total_page = items[0]['totalPage']
147+
import re
148+
current_page = int(re.findall('page=(\\d+)', from_url)[0])
149+
for i in range(1, total_page + 1):
150+
if current_page == i:
151+
continue
152+
url = "http://www.china2cv.com/truck-foton-web/api/shop/v1/getShopList?page=%d&pageSize=10" % i
153+
yield ClueItem({"project": "huoche", "spider": self.name, "req": TaskRequest(
154+
url=url,
155+
meta={"from_clue_id": clue_id}
156+
)})
157+
for item in items:
158+
for id in item['ids']:
159+
r = Request(url="http://www.china2cv.com/truck-foton-web/api/shop/v1/getShopInfo?shopId=%d" % int(id),
160+
callback=self._response_downloaded)
161+
r.meta.update(rule=0, from_clue_id=clue_id)
162+
yield r
163+
164+
def process_results(self, response, results):
165+
for item in results:
166+
item['url'] = 'http://www.china2cv.com/storeDetail.html?typess=1&shopId=' + str(item['dealer_id'])
167+
return results
168+
```
169+
170+
更多请参考:[examples](./examples)
171+
19172
### 原理
20173
- 解析器
21174
>PE向调用方提供一套简单、易懂的参数,实际会将其`编译`成较为复杂的xpath表达式,再借助scrapy封装的解析器将所需内容提取出来。
@@ -41,8 +194,6 @@
41194
>一个简单的需求场景:API返回的性别字段是0和1,但是需要将其转换成"男"和"女"。
42195
43196
### 待做清单
44-
- 功能
45-
46197
- 优化
47198
- [ ] 支持直接在`Item`的类定义中定义模板
48199
>用法示例:原模板的`itemname`参数通过注解传参,其他的模板参数定义在`Item`类中,如下所示。
@@ -124,9 +275,6 @@ TemplateAnnotation注解中传进来的参数,除了下面列出的,其他
124275

125276
- tpls: 模板的数组,或者模板id的数组
126277

127-
其它约定:
128-
- Spider类的`name`类变量,会被翻译成`business`赋值给item。
129-
130278
具体请参考[decorator.py](./parser_engine/decorator.py)中的注释及源代码。
131279

132280
#### Html格式

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
0.1.1
1+
0.1.2

demo/demo/items.py

Lines changed: 0 additions & 81 deletions
This file was deleted.

demo/demo/middlewares.py

Lines changed: 0 additions & 103 deletions
This file was deleted.

0 commit comments

Comments
 (0)