|
16 | 16 | - 安装稳定版:
|
17 | 17 | >`pip install -U parser_engine`
|
18 | 18 |
|
| 19 | +### 示例 |
| 20 | +- 极简版,使用`CrawlSpider`的rules机制。 |
| 21 | +```python |
| 22 | +from scrapy.spiders import CrawlSpider |
| 22 | +from parser_engine import TemplateAnnotation |
| 23 | +@TemplateAnnotation(tpls="demo") |
| 24 | +class DemoSpider4(CrawlSpider): |
| 25 | + name = "demo4" |
| 26 | + start_urls = [ |
| 27 | + "http://github.cannot.cc/baixing-helper" |
| 28 | + ] |
| 29 | +``` |
| 30 | + |
| 31 | +- 使用scrapy_redis,解析start_urls的响应。 |
| 32 | +```python |
| 33 | +from parser_engine import TemplateAnnotation |
| 34 | +from parser_engine.clue.spider import ClueSpider |
| 35 | +@TemplateAnnotation(start_url_tpl=({ |
| 36 | + "name": "zhongguozhongqi_xiaoshouwangluo", |
| 37 | + "itemname": "HuocheDealerItem", |
| 38 | + "parent": { |
| 39 | + "xpath": "//tr[@class=\"bgcolor2\"]" |
| 40 | + }, |
| 41 | + "fields": [ |
| 42 | + { |
| 43 | + "key": "area", |
| 44 | + "xpath": "td[1]/text()", |
| 45 | + "value_type": "stripped_string" |
| 46 | + }, { |
| 47 | + "key": "leads_name", |
| 48 | + "xpath": "td[2]/text()", |
| 49 | + "value_type": "stripped_string" |
| 50 | + }, { |
| 51 | + "key": "address", |
| 52 | + "xpath": "td[3]/text()", |
| 53 | + "value_type": "stripped_string" |
| 54 | + }, { |
| 55 | + "key": "phone", |
| 56 | + "xpath": "td[5]/text()", |
| 57 | + "value_type": "stripped_string" |
| 58 | + } |
| 59 | + ] |
| 60 | +}), channel='zhongguozhongqi', leads_src='中国重汽') |
| 61 | +class ZhongguozhongqiSpider(ClueSpider): |
| 62 | + name = 'zhongguozhongqi' |
| 63 | + def parse(self, response): |
| 64 | + items = self._parse_start_url(response) |
| 65 | + for item in items: |
| 66 | + phone = item.get('phone') |
| 67 | + if phone: |
| 68 | + item['phone'] = phone.replace('、', ',') |
| 69 | + yield item |
| 70 | + self.finish_clue(response, len(items)) |
| 71 | +``` |
| 72 | + |
| 73 | +- 使用scrapy_redis,灵活运用多种PE特性。 |
| 74 | +```python |
| 75 | +from parser_engine.clue.spider import ClueSpider |
| 76 | +from parser_engine import TemplateAnnotation |
| 77 | +from parser_engine.clue.items import ClueItem |
| 78 | +from parser_engine.request import TaskRequest |
| 79 | +from scrapy import Request |
| 80 | +@TemplateAnnotation(start_url_tpl=({ |
| 81 | + "name": "youka_shop_listing_api", |
| 82 | + "parent": { |
| 83 | + "json_key": "data", |
| 84 | + }, |
| 85 | + "fields": [{ |
| 86 | + "key": "totalPage", |
| 87 | + "json_key": "totalPage", |
| 88 | + |
| 89 | + }, { |
| 90 | + "key": "ids", |
| 91 | + "json_path": "dataList[*].id" |
| 92 | + }] |
| 93 | + },), |
| 94 | + tpls=({ |
| 95 | + "name": "youka_shop_detail_api", |
| 96 | + "itemname": "HuocheDealerItem", |
| 97 | + "parent": { |
| 98 | + "json_key": "data", |
| 99 | + }, |
| 100 | + "fields": [{ |
| 101 | + "key": "company_type", |
| 102 | + "json_key": "category", |
| 103 | + "mapper": { |
| 104 | + 1: "二手车直营店", |
| 105 | + 2: "4S店" |
| 106 | + } |
| 107 | + }, { |
| 108 | + "key": "dealer_id", |
| 109 | + "json_key": "id", |
| 110 | + "required": 1, |
| 111 | + }, { |
| 112 | + "key": "leads_name", |
| 113 | + "json_key": "shopName", |
| 114 | + }, { |
| 115 | + "key": "area", |
| 116 | + "json_path": "districtDto.districtName", |
| 117 | + "value_type": "singleton" |
| 118 | + }, { |
| 119 | + "key": "city", |
| 120 | + "json_path": "cityDto.cityName", |
| 121 | + "value_type": "singleton" |
| 122 | + }, { |
| 123 | + "key": "service_phone", |
| 124 | + "default_value": "", |
| 125 | + }, { |
| 126 | + "key": "wechat", |
| 127 | + "json_key": "wechat", |
| 128 | + }, { |
| 129 | + "key": "tags", |
| 130 | + "json_key": "tags", |
| 131 | + "join": "," |
| 132 | + }] |
| 133 | + }), channel='youka', leads_src='优卡') |
| 134 | +class YoukaSpider(ClueSpider): |
| 135 | + name = 'youka' |
| 136 | + custom_settings = { |
| 137 | + 'CONCURRENT_REQUESTS': 2, |
| 138 | + 'CONCURRENT_REQUESTS_PER_DOMAIN': 1 |
| 139 | + } |
| 140 | + def parse(self, response): |
| 141 | + items = self._parse_start_url(response) |
| 142 | + meta = response.meta |
| 143 | + clue_id = meta.get('clue_id') |
| 144 | + from_url = response.request.url |
| 145 | + if meta.get('open_pages'): |
| 146 | + total_page = items[0]['totalPage'] |
| 147 | + import re |
| 148 | + current_page = int(re.findall('page=(\\d+)', from_url)[0]) |
| 149 | + for i in range(1, total_page + 1): |
| 150 | + if current_page == i: |
| 151 | + continue |
| 152 | + url = "http://www.china2cv.com/truck-foton-web/api/shop/v1/getShopList?page=%d&pageSize=10" % i |
| 153 | + yield ClueItem({"project": "huoche", "spider": self.name, "req": TaskRequest( |
| 154 | + url=url, |
| 155 | + meta={"from_clue_id": clue_id} |
| 156 | + )}) |
| 157 | + for item in items: |
| 158 | + for id in item['ids']: |
| 159 | + r = Request(url="http://www.china2cv.com/truck-foton-web/api/shop/v1/getShopInfo?shopId=%d" % int(id), |
| 160 | + callback=self._response_downloaded) |
| 161 | + r.meta.update(rule=0, from_clue_id=clue_id) |
| 162 | + yield r |
| 163 | + |
| 164 | + def process_results(self, response, results): |
| 165 | + for item in results: |
| 166 | + item['url'] = 'http://www.china2cv.com/storeDetail.html?typess=1&shopId=' + str(item['dealer_id']) |
| 167 | + return results |
| 168 | +``` |
| 169 | + |
| 170 | +更多请参考:[examples](./examples)。 |
| 171 | + |
19 | 172 | ### 原理
|
20 | 173 | - 解析器
|
21 | 174 | >PE向调用方提供一套简单、易懂的参数,实际会将其`编译`成较为复杂的xpath表达式,再借助scrapy封装的解析器将所需内容提取出来。
|
|
41 | 194 | >一个简单的需求场景:API返回的性别字段是0和1,但是需要将其转换成"男"和"女"。
|
42 | 195 |
|
43 | 196 | ### 待做清单
|
44 |
| -- 功能 |
45 |
| - |
46 | 197 | - 优化
|
47 | 198 | - [ ] 支持直接在`Item`的类定义中定义模板
|
48 | 199 | >用法示例:原模板的`itemname`参数通过注解传参,其他的模板参数定义在`Item`类中,如下所示。
|
@@ -124,9 +275,6 @@ TemplateAnnotation注解中传进来的参数,除了下面列出的,其他
|
124 | 275 |
|
125 | 276 | - tpls: 模板的数组,或者模板id的数组
|
126 | 277 |
|
127 |
| -其它约定: |
128 |
| -- Spider类的`name`类变量,会被翻译成`business`赋值给item。 |
129 |
| - |
130 | 278 | 具体请参考[decorator.py](./parser_engine/decorator.py)中的注释及源代码。
|
131 | 279 |
|
132 | 280 | #### Html格式
|
|
0 commit comments