A-A+

Guzzle 使用dome案例 及 多线程异步案例

2018年05月30日 21:00 汪洋大海 暂无评论 阅读 22 views 次

虽然早就知道很多人用 Guzzle 爬数据,但是我却从来没有真正实践过,因为在我的潜意识里,抓取是 Python 的地盘。不过前段时间,当我抓汽车之家数据的时候,好心人跟我提起 Goutte 搭配 Guzzle 是最好的爬虫,让我一直记挂在心上,加上最近打算更新一下车型数据,于是我便重写了抓取汽车之家数据的脚本。

因为我是通过接口抓取,而不是网页,所以暂时用不上 Goutte,只用 Guzzle 就可以了,抓取过程中需要注意两点:首先需要注意的是通过并发节省时间,其次需要注意的是失败重试的步骤。算了,我不想说了,直接贴代码吧。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
<?php
 
require "vendor/autoload.php";
 
use GuzzleHttp\Pool;
use GuzzleHttp\Client;
use GuzzleHttp\Middleware;
use GuzzleHttp\HandlerStack;
use GuzzleHttp\Psr7\Request;
 
// 品牌
$brands = [];
// 车系
$series = [];
// 车型
$models = [];
 
// 配置
$configs = [];
 
$timeout = 10;
$concurrency = 100;
 
ini_set("memory_limit", "512M");
 
$stack = HandlerStack::create();
$stack->push(Middleware::retry(
    function($retries) { return $retries < 3; },
    function($retries) { return pow(2, $retries - 1); }
));
 
$client = new Client([
    "debug" => true,
    "timeout" => $timeout,
    "base_uri" => "https://cars.app.autohome.com.cn",
    "headers" => [
        "User-Agent" => "Android\t6.0.1\tautohome\t8.3.0\tAndroid",
    ],
    "handler" => $stack,
]);
 
// 品牌列表页
$url = "/cars_v8.3.0/cars/brands-pm2.json";
 
$response = $client->get($url);
$contents = $response->getBody()->getContents();
$contents = json_decode($contents, true);
$contents = $contents["result"]["brandlist"];
 
foreach ($contents as $values) {
    $initial = $values["letter"];
 
    foreach ($values["list"] as $v) {
        $brands[$v["id"]] = [
            "id" => $v["id"],
            "name" => $v["name"],
            "initial" => $initial,
        ];
    }
}
 
$brands = array_values($brands);
 
###

$requests = function ($brands) {
    foreach ($brands as $v) {
        $id = $v["id"];
        // 品牌介绍页
        $url = "/cars_v8.3.0/cars/getbrandinfo-pm2-b{$id}.json";
        yield new Request("GET", $url);
    }
};
 
$pool = new Pool($client, $requests($brands), [
    "concurrency" => $concurrency,
    "fulfilled" => function ($response, $index) use(&$brands) {
        $contents = $response->getBody()->getContents();
        $contents = json_decode($contents, true);
        $contents = $contents["result"]["list"];
        $contents = $contents ? $contents[0]["description"] : "暂无";
        $contents = trim(str_replace(["\r\n", ","], ["\n", ","], $contents));
 
        $brands[$index]["description"] = $contents;
    },
]);
 
$pool->promise()->wait();
 
$requests = function ($brands) {
    foreach ($brands as $v) {
        $id = $v["id"];
        // 车系列表页
        $url = "/cars_v8.3.0/cars/seriesprice-pm2-b{$id}-t16-v8.3.0.json";
        yield new Request("GET", $url);
    }
};
 
$pool = new Pool($client, $requests($brands), [
    "concurrency" => $concurrency,
    "fulfilled" => function ($response, $index) use(&$series, $brands) {
        $contents = $response->getBody()->getContents();
        $contents = json_decode($contents, true);
        $contents = $contents["result"];
 
        $brand_id = $brands[$index]["id"];
 
        foreach (["fctlist", "otherfctlist"] as $field) {
            $values = $contents[$field];
 
            foreach ($values as $value) {
                $factory = $value["name"];
 
                foreach ($value["serieslist"] as $v) {
                    list($min, $max) = explode("-", $v["price"]) + [1 => 0];
 
                    $min_price = $min * 10000;
                    $max_price = $max * 10000;
 
                    if ($max_price == 0) {
                        $max_price = $min_price;
                    }
 
                    $series[$v["id"]] = [
                        "id" => $v["id"],
                        "name" => $v["name"],
                        "level" => $v["levelname"],
                        "factory" => $factory,
                        "min_price" => $min_price,
                        "max_price" => $max_price,
                        "brand_id" => $brand_id,
                    ];
                }
            }
        }
    },
]);
 
$pool->promise()->wait();
 
$series = array_values($series);
 
###

$requests = function ($series) {
    foreach ($series as $v) {
        $id = $v["id"];
        // 车型列表页
        $url = "/carinfo_v8.3.0/cars/seriessummary-pm2-s{$id}-t-c110100-v8.3.0.json";
        yield new Request("GET", $url);
    }
};
 
$pool = new Pool($client, $requests($series), [
    "concurrency" => $concurrency,
    "fulfilled" => function ($response, $index) use(&$models, $series) {
        $contents = $response->getBody()->getContents();
        $contents = json_decode($contents, true);
        $contents = $contents["result"]['enginelist'];
 
        $series_id = $series[$index]["id"];
 
        foreach ($contents as $values) {
            if (in_array($values["yearvalue"], [0, 1])) {
                continue;
            }
 
            foreach ($values["yearspeclist"] as $value) {
                foreach ($value["speclist"] as $v) {
                    if (isset($models[$v["id"]])) {
                        continue;
                    }
 
                    $price = $v["price"] * 10000;
 
                    $description = trim($v["description"]);
 
                    if (!$description) {
                        $description = "暂无";
                    }
 
                    $models[$v["id"]] = [
                        "id" => $v["id"],
                        "name" => $v["name"],
                        "status" => $v["state"],
                        "price" => $price,
                        "description" => $description,
                        "series_id" => $series_id,
                    ];
                }
            }
        }
    },
]);
 
$pool->promise()->wait();
 
$models = array_values($models);
 
###

$requests = function ($models) {
    foreach ($models as $v) {
        $id = $v["id"];
        // 车型参数页
        $url = "/cfg_v8.3.0/cars/speccompare.ashx?pm=2&type=1&specids={$id}&cityid=110100&site=2&pl=2";
        yield new Request("GET", $url);
    }
};
 
$pool = new Pool($client, $requests($models), [
    "concurrency" => $concurrency,
    "fulfilled" => function ($response, $index) use(&$models, &$configs) {
        $contents = $response->getBody()->getContents();
        $contents = json_decode($contents, true);
        $contents = $contents["result"];
 
        $models[$index]["config"] = [];
 
        foreach (["paramitems", "configitems"] as $key) {
            $values = $contents[$key];
 
            foreach ($values as $value) {
                $category = $value["itemtype"];
 
                foreach ($value["items"] as $v) {
                    $id = $v["id"];
 
                    if ($id < 1) {
                        continue;
                    }
 
                    $name = $v["name"];
                    $value = $v["modelexcessids"][0]["value"];
 
                    if ($value != "-") {
                        $models[$index]["config"][$id] = $value;
                    }
 
                    if (!isset($configs[$category][$id])) {
                        $configs[$category][$id] = [
                            "id" => $id,
                            "name" => $name,
                            "category" => $category,
                        ];
                    }
                }
            }
        }
 
        $models[$index]["config"] = json_encode(
            $models[$index]["config"], JSON_UNESCAPED_UNICODE
        );
    },
]);
 
$pool->promise()->wait();
 
$configs = call_user_func_array("array_merge", $configs);
 
// todo: 保存数据
 
?>

编写此类工具性质的脚本无需考虑面向对象之类的弯弯绕,一马平川的流水账往往是最好的选择。运行前记得先通过 composer 安装 guzzle,整个运行过程大概会执行三万次抓取请求,可以抓取汽车之家完整的品牌,车系,车型及配置等相关数据,总耗时大概十分钟左右,效率还是可以接受的。
文章为转载:https://huoding.com/2017/08/23/633

标签:

给我留言