forked from PRNDA/you2php
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathQueryList.php
472 lines (437 loc) · 16 KB
/
QueryList.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
<?php
namespace QL;
use phpQuery,Exception,ReflectionClass;
use Monolog\Logger;
use Monolog\Handler\StreamHandler;
/**
 * QueryList
 *
 * A general-purpose list scraping class built on top of phpQuery.
 *
 * @author Jaeger
 * @email [email protected]
 * @link http://git.oschina.net/jae/QueryList
 * @version 3.2.1
 *
 * @example
 *
 // Fetch the article titles listed in CSDN's mobile development section
 $hj = QueryList::Query('http://mobile.csdn.net/',array("title"=>array('.unit h1','text')));
 print_r($hj->data);

 // Callback 1
 function callfun1($content,$key)
 {
     return 'callback 1:'.$key.'-'.$content;
 }
 class HJ{
     // Callback 2
     static public function callfun2($content,$key)
     {
         return 'callback 2:'.$key.'-'.$content;
     }
 }

 // Fetch the title and body of a CSDN article page
 $url = 'http://www.csdn.net/article/2014-06-05/2820091-build-or-buy-a-mobile-game-backend';
 $rules = array(
     'title'=>array('h1','text','','callfun1'),                     // plain-text title, passed through callback 1
     'summary'=>array('.summary','text','-input strong'),           // plain-text summary, keeping <strong> and removing <input> elements
     'content'=>array('.news_content','html','div a -.copyright'),  // html body, stripping div/a tags and removing elements with class "copyright"
     'callback'=>array('HJ','callfun2')                             // callback 2 is used as the global callback
 );
 $rang = '.left';
 $hj = QueryList::Query($url,$rules,$rang);
 print_r($hj->data);

 // Continue with the list of related hot articles on the right: title and link
 $hj->setQuery(array('title'=>array('','text'),'url'=>array('a','href')),'#con_two_2 li');

 // Output the data
 print_r($hj->getData());
 */
class QueryList
{
    /** @var array Rows extracted by the most recent query (list of key => value maps). */
    public $data;

    /** @var mixed Raw HTML the most recent query ran against (whatever the request returned). */
    public $html;

    /** @var mixed The URL or HTML string passed to the most recent query. */
    private $page;

    /** @var string Outer HTML of the document as re-serialised by phpQuery. */
    private $pqHtml;

    /** @var string|false Encoding to convert results to; false leaves results untouched. */
    private $outputEncoding = false;

    /** @var string|false Explicit input encoding; false means auto-detect. */
    private $inputEncoding = false;

    /** @var string|false Encoding assumed for the fetched HTML. */
    private $htmlEncoding;

    /** @var array Selector rules of the current query. */
    private $regArr = array();

    /** @var string Block (range) selector of the current query. */
    private $regRange = '';

    /** @var \Monolog\Logger|null Logger installed through setLog(), if any. */
    public static $logger = null;

    /** @var array Instances created by getInstance(), keyed by a hash of the arguments. */
    public static $instances = array();

    public function __construct()
    {
    }

    /**
     * Static entry point: run a query on the shared QueryList instance.
     *
     * @param string $page URL of the page to scrape (https is supported), or HTML source code
     * @param array $rules Selector array, in the form
     *        array("name"=>array("selector","type"[,"tag filter list"][,"callback"]),...[,"callback"=>"global callback"])
     *        - selector: any jQuery selector syntax
     *        - type: "text", "html", or the name of an HTML attribute
     *        - tag filter list: optional, space separated. An entry prefixed with a minus sign (-) may be
     *          any element selector and removes the matching elements together with their content.
     *          Other entries are tag names: kept when type is "text", stripped when type is "html".
     *        - callback / global callback: optional; a function name or array("Class","staticMethod").
     *          It receives the selected content and the rule key. A per-rule callback overrides the global one.
     * @param string $range Block selector: blocks are selected first, then the rules are applied inside each block
     * @param string $outputEncoding Encoding to output (UTF-8, GB2312, ...); a falsy value leaves the encoding unchanged
     * @param string $inputEncoding Encoding of the input page (UTF-8, GB2312, ...); a falsy value means auto-detect
     * @param bool|false $removeHead Whether to strip the page's head section (last-resort fix for garbled text)
     * @return mixed
     */
    public static function Query($page, array $rules, $range = '', $outputEncoding = null, $inputEncoding = null, $removeHead = false)
    {
        return self::getInstance()->_query($page, $rules, $range, $outputEncoding, $inputEncoding, $removeHead);
    }

    /**
     * Run a QueryList extension.
     *
     * @param string $class Extension class name, resolved inside the QL\Ext namespace
     * @param array $args Arguments handed to the extension's run() method
     * @return mixed
     * @throws Exception
     */
    public static function run($class, $args = array())
    {
        $extension = self::getInstance("QL\\Ext\\{$class}");
        return $extension->run($args);
    }

    /**
     * Configure logging.
     *
     * @param mixed $handler A Monolog handler, or a string that is wrapped in a StreamHandler at INFO level
     * @throws Exception When monolog/monolog is not installed
     */
    public static function setLog($handler)
    {
        if (!class_exists('Monolog\Logger')) {
            throw new Exception("You need to install the package [monolog/monolog]");
        }
        if (is_string($handler)) {
            $handler = new StreamHandler($handler, Logger::INFO);
        }
        self::$logger = new Logger('QueryList');
        self::$logger->pushHandler($handler);
    }

    /**
     * Get (and cache) an instance of any class.
     *
     * The first argument is the class name and the remaining arguments are passed to its constructor.
     * Without arguments the QueryList instance itself is returned. Instances are cached per
     * distinct argument list.
     *
     * @return mixed
     * @throws Exception When the class does not exist
     */
    public static function getInstance()
    {
        $args = func_get_args();
        if (!count($args)) {
            // __CLASS__ is 'QL\QueryList' inside this file's namespace.
            $args = array(__CLASS__);
        }
        $key = md5(serialize($args));
        $className = array_shift($args);
        if (!class_exists($className)) {
            throw new Exception("no class {$className}");
        }
        if (!isset(self::$instances[$key])) {
            $rc = new ReflectionClass($className);
            self::$instances[$key] = $rc->newInstanceArgs($args);
        }
        return self::$instances[$key];
    }

    /**
     * Get the source of the target page (mainly for debugging).
     *
     * @param bool|true $rel true returns the document as re-serialised by phpQuery, false the raw HTML
     * @return string
     */
    public function getHtml($rel = true)
    {
        return $rel ? $this->pqHtml : $this->html;
    }

    /**
     * Get the scraped data.
     *
     * @param callback $callback Optional callable applied to every row through array_map()
     * @return array
     */
    public function getData($callback = null)
    {
        if (is_callable($callback)) {
            return array_map($callback, $this->data);
        }
        return $this->data;
    }

    /**
     * Run a new set of selectors against the HTML of the previous query.
     *
     * @param array $rules
     * @param string $range
     * @param string $outputEncoding
     * @param string $inputEncoding
     * @param bool|false $removeHead
     * @return QueryList
     */
    public function setQuery(array $rules, $range = '', $outputEncoding = null, $inputEncoding = null, $removeHead = false)
    {
        return $this->_query($this->html, $rules, $range, $outputEncoding, $inputEncoding, $removeHead);
    }

    /**
     * Fetch the page if needed, record the query settings, extract the data and log the outcome.
     *
     * @return QueryList $this
     */
    private function _query($page, array $rules, $range, $outputEncoding, $inputEncoding, $removeHead)
    {
        $this->data = array();
        $this->page = $page;
        $this->html = $this->_isURL($this->page) ? $this->_request($this->page) : $this->page;

        // Falsy encodings keep whatever an earlier query configured on this instance.
        if ($outputEncoding) {
            $this->outputEncoding = $outputEncoding;
        }
        if ($inputEncoding) {
            $this->inputEncoding = $inputEncoding;
        }
        if ($removeHead) {
            $this->html = $this->_removeHead($this->html);
        }
        $this->pqHtml = '';

        if (empty($this->html)) {
            $this->_log('The received content is empty!', 'error');
            trigger_error('The received content is empty!', E_USER_NOTICE);
        }

        // Determine the encoding of the HTML
        $this->htmlEncoding = $this->inputEncoding ? $this->inputEncoding : $this->_getEncode($this->html);
        $this->regArr = $rules;
        $this->regRange = $range;
        $this->_getList();
        $this->_log();
        return $this;
    }

    /**
     * Apply the current rules to the current HTML and fill $this->data.
     */
    private function _getList()
    {
        if ($this->inputEncoding) {
            phpQuery::$defaultCharset = $this->inputEncoding;
        }
        $document = phpQuery::newDocumentHTML($this->html);
        $this->pqHtml = $document->htmlOuter();

        if (!empty($this->regRange)) {
            // One row per block matched by the range selector.
            $blocks = pq($document)->find($this->regRange);
            $i = 0;
            foreach ($blocks as $block) {
                foreach ($this->regArr as $key => $reg_value) {
                    if ($key === 'callback') {
                        continue;
                    }
                    $target = pq($block)->find($reg_value[0]);
                    $this->data[$i][$key] = $this->_extractValue(pq($target), $reg_value, $key);
                }
                $i++;
            }
        } else {
            // No range: the n-th match of every rule goes into row n.
            foreach ($this->regArr as $key => $reg_value) {
                if ($key === 'callback') {
                    continue;
                }
                // NOTE(review): the document is re-parsed for every rule, as in the original code;
                // hoisting it looks safe but was not verified against phpQuery internals.
                $document = phpQuery::newDocumentHTML($this->html);
                $matches = pq($document)->find($reg_value[0]);
                $i = 0;
                foreach ($matches as $node) {
                    $this->data[$i][$key] = $this->_extractValue(pq($node), $reg_value, $key);
                    $i++;
                }
            }
        }

        if ($this->outputEncoding) {
            // Convert the result to the requested encoding
            $this->data = $this->_arrayConvertEncoding($this->data, $this->outputEncoding, $this->htmlEncoding);
        }
        phpQuery::$documents = array();
    }

    /**
     * Read one rule's value from a phpQuery selection and pass it through the applicable callback.
     *
     * @param mixed $node phpQuery object for the selected element(s)
     * @param array $reg_value The rule: array(selector, type[, tag filter list][, callback])
     * @param mixed $key The rule's key in the rules array
     * @return mixed
     */
    private function _extractValue($node, $reg_value, $key)
    {
        $tags = isset($reg_value[2]) ? $reg_value[2] : '';
        switch ($reg_value[1]) {
            case 'text':
                $value = $this->_allowTags($node->html(), $tags);
                break;
            case 'html':
                $value = $this->_stripTags($node->html(), $tags);
                break;
            default:
                $value = $node->attr($reg_value[1]);
                break;
        }

        // A per-rule callback takes precedence over the global one.
        if (isset($reg_value[3])) {
            return call_user_func($reg_value[3], $value, $key);
        }
        if (isset($this->regArr['callback'])) {
            return call_user_func($this->regArr['callback'], $value, $key);
        }
        return $value;
    }

    /**
     * Perform the HTTP request for a URL.
     *
     * @param string $url
     * @return string|false Response body; false when the request fails
     */
    private function _request($url)
    {
        if (function_exists('curl_init')) {
            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $url);
            // SECURITY(review): certificate and host verification are switched off, so https
            // responses can be spoofed. Left unchanged because callers may depend on it.
            curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
            curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
            curl_setopt($ch, CURLOPT_AUTOREFERER, true);
            curl_setopt($ch, CURLOPT_REFERER, $url);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
            curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36');
            $result = curl_exec($ch);
            curl_close($ch);
            return $result;
        }

        // Without curl, fall back to the stream wrapper and send the URL as its own referer.
        $opts = array(
            'http' => array(
                'header' => "Referer:{$url}"
            )
        );
        return file_get_contents($url, false, stream_context_create($opts));
    }

    /**
     * Replace the page's head section with an empty one.
     *
     * @param string $html
     * @return mixed
     */
    private function _removeHead($html)
    {
        // \b keeps <header> from matching; the lazy body stops at the first </head>.
        $stripped = preg_replace('/<head\b[^>]*>.*?<\/head>/is', '<head></head>', (string) $html);
        // preg_replace() yields null on a PCRE error; keep the input in that case.
        return $stripped === null ? $html : $stripped;
    }

    /**
     * Detect the encoding of a string.
     *
     * @param string $string
     * @return string
     */
    private function _getEncode($string)
    {
        return mb_detect_encoding($string, array('ASCII', 'GB2312', 'GBK', 'UTF-8'));
    }

    /**
     * Convert the encoding of every string key and string value in an array, recursively.
     *
     * Non-string values are returned untouched. A string that iconv() cannot convert is
     * kept in its original form.
     *
     * @param array $arr
     * @param string $toEncoding
     * @param string $fromEncoding
     * @return array
     */
    private function _arrayConvertEncoding($arr, $toEncoding, $fromEncoding)
    {
        if (!is_array($arr)) {
            return is_string($arr) ? $this->_convertEncoding($arr, $toEncoding, $fromEncoding) : $arr;
        }
        $converted = array();
        foreach ($arr as $key => $value) {
            if (is_string($key)) {
                $key = $this->_convertEncoding($key, $toEncoding, $fromEncoding);
            }
            $converted[$key] = $this->_arrayConvertEncoding($value, $toEncoding, $fromEncoding);
        }
        return $converted;
    }

    /**
     * Convert one string with iconv(), dropping characters the target encoding cannot represent.
     *
     * @param string $string
     * @param string $toEncoding
     * @param string $fromEncoding
     * @return string
     */
    private function _convertEncoding($string, $toEncoding, $fromEncoding)
    {
        $result = iconv($fromEncoding, $toEncoding . '//IGNORE', $string);
        return $result === false ? $string : $result;
    }

    /**
     * Simple check for whether the argument is an http(s) URL.
     *
     * @param string $str
     * @return boolean
     */
    private function _isURL($str)
    {
        return (bool) preg_match('/^http(s)?:\\/\\/.+/', (string) $str);
    }

    /**
     * Strip specific HTML tags (the tags themselves, not their content).
     *
     * @param string $html
     * @param string $tags_str Tag names separated by whitespace
     * @return string
     */
    private function _stripTags($html, $tags_str)
    {
        $tagsArr = $this->_tag($tags_str);
        $html = $this->_removeTags($html, $tagsArr[1]);
        $patterns = array();
        foreach ($tagsArr[0] as $tag) {
            // The lookahead requires the tag name to end here, so "a" does not also hit <abbr>.
            $patterns[] = '/<\/?' . preg_quote($tag, '/') . '(?=[\s\/>])[^>]*>/i';
        }
        return preg_replace($patterns, '', trim((string) $html));
    }

    /**
     * Strip all HTML tags except specific ones.
     *
     * @param string $html
     * @param string $tags_str Tag names separated by whitespace
     * @return string
     */
    private function _allowTags($html, $tags_str)
    {
        $tagsArr = $this->_tag($tags_str);
        $html = $this->_removeTags($html, $tagsArr[1]);
        $allow = '';
        foreach ($tagsArr[0] as $tag) {
            $allow .= "<$tag> ";
        }
        return strip_tags(trim((string) $html), $allow);
    }

    /**
     * Split a tag filter list into plain tag names and removal selectors.
     *
     * @param string $tags_str Whitespace separated entries; a leading "-" marks a removal selector
     * @return array array(0 => plain tag names, 1 => removal selectors without the leading "-")
     */
    private function _tag($tags_str)
    {
        $tags = array(array(), array());
        foreach (preg_split("/\s+/", (string) $tags_str, -1, PREG_SPLIT_NO_EMPTY) as $tag) {
            // Only a leading minus marks a removal; names such as "my-tag" stay plain tags.
            if (strlen($tag) > 1 && $tag[0] === '-') {
                $tags[1][] = substr($tag, 1);
            } else {
                $tags[0][] = $tag;
            }
        }
        return $tags;
    }

    /**
     * Remove the elements matching the given selectors, including their content.
     *
     * @param string $html
     * @param array $tags Array of selectors
     * @return string
     */
    private function _removeTags($html, $tags)
    {
        if (count($tags)) {
            phpQuery::$defaultCharset = $this->inputEncoding ? $this->inputEncoding : $this->htmlEncoding;
            $doc = phpQuery::newDocumentHTML($html);
            pq($doc)->find(implode(',', $tags))->remove();
            $html = pq($doc)->htmlOuter();
            $doc->unloadDocument();
        }
        return $html;
    }

    /**
     * Write a log entry when a logger has been configured.
     *
     * With no explicit message or level they are derived from the result: "info" and
     * "Get data successfully" when rows were extracted, "warning" and "Get data failed" otherwise.
     *
     * @param string $message
     * @param string $level Name of the logger method to call; empty to derive it from the result
     */
    private function _log($message = '', $level = '')
    {
        if (is_null(self::$logger)) {
            return;
        }
        $url = $this->_isURL($this->page) ? $this->page : '[html]';
        $count = count($this->data);
        if (empty($level)) {
            $level = $count ? 'info' : 'warning';
        }
        if (empty($message)) {
            $message = $count ? 'Get data successfully' : 'Get data failed';
        }
        self::$logger->$level($message, array(
            'page' => $url,
            'count' => $count
        ));
    }
}
/*
class Autoload
{
public static function load($className)
{
$files = array(
sprintf('%s/extensions/%s.php',__DIR__,$className),
sprintf('%s/extensions/vendors/%s.php',__DIR__,$className)
);
foreach ($files as $file) {
if(is_file($file)){
require $file;
return true;
}
}
return false;
}
}
spl_autoload_register(array('Autoload','load'));
*/