我正在尝试编写一个爬虫，从不提供任何 API 访问权限的站点抓取模板，以便日后以会员身份展示。我刚开始使用 CodeIgniter（CI），已经读过几遍文档，下面是我第一次尝试用 OOP 方式编写的代码。我的问题是：我在 OOP 方面是否走在正确的道路上？我的代码有哪些可以改进的地方（我确信一定有）？我在网上读了很多 OOP 教程，但大家对 OOP 编码似乎各有不同的看法。
提前感谢您。
<?php
/**
 * Crawls theme marketplaces that expose no API and stores newly listed themes
 * in the `themes` table.
 *
 * Depends on the global helpers file_get_html() / str_get_html() (presumably
 * the PHP Simple HTML DOM Parser -- confirm it is loaded before this model is
 * used) and on the CodeIgniter database object available as $this->db.
 */
class Crawler extends CI_Model
{
    /**
     * DOM of the page fetched most recently by get_with_curl(); null once the
     * page has been released, false when the last fetch failed.
     *
     * Declared explicitly so the model does not rely on dynamic properties.
     *
     * @var object|false|null
     */
    public $html = null;

    public function __construct()
    {
        parent::__construct();
    }

    /**
     * Downloads a page and parses it into a DOM object.
     *
     * cURL is used whenever the extension is available because it does not
     * depend on the allow_url_fopen setting. file_get_html() is only a
     * fallback, and only when allow_url_fopen is enabled (it presumably reads
     * the URL through PHP's stream wrappers, which that setting controls).
     *
     * The result is also stored in $this->html.
     *
     * @param string $url Absolute URL of the page to download.
     * @return object|false Parsed DOM, or false when the page could not be
     *                      fetched or parsed.
     */
    public function get_with_curl($url)
    {
        $dom = false;

        if (function_exists('curl_init')) {
            $body = $this->fetch_body_with_curl($url);
            // curl_exec() yields false on failure; never hand that to the parser.
            if (is_string($body) && $body !== '') {
                $dom = str_get_html($body);
            }
        } elseif (ini_get('allow_url_fopen')) {
            $dom = file_get_html($url);
        }

        $this->html = is_object($dom) ? $dom : false;

        return $this->html;
    }

    /**
     * Collapses a list of links to its unique values and keeps every second
     * one (positions 0, 2, 4, ... of the de-duplicated list).
     *
     * @param array $links List of link URLs (numerically indexed).
     * @return array Re-indexed list containing the kept links.
     */
    public function array_arrange($links)
    {
        $unique = array_values(array_unique($links));

        $kept = array();
        foreach ($unique as $position => $link) {
            if ($position % 2 === 0) {
                $kept[] = $link;
            }
        }

        return $kept;
    }

    /**
     * Works out which links are not stored yet for the given source.
     *
     * The comparison is purely count based: when the table holds N rows for
     * $source and $links has more than N entries, the trailing
     * (count($links) - N) entries are treated as new.
     *
     * @param string $source Value of the `source` column to count.
     * @param array  $links  Links currently listed by the source.
     * @return array|false Trailing slice of $links (original keys preserved),
     *                     or false when nothing new is listed.
     */
    public function diff($source, $links)
    {
        // Exact match: a substring LIKE would also count rows of any source
        // whose name merely contains $source.
        $this->db->where('source', $source);
        $this->db->from('themes');
        $stored = $this->db->count_all_results();

        $missing = count($links) - $stored;
        if ($missing <= 0) {
            return false;
        }

        return array_slice($links, -$missing, $missing, true);
    }

    /**
     * Fetches a page and collects the href of every element matching a
     * selector.
     *
     * @param string $url  Page to fetch.
     * @param string $find Selector passed to the DOM's find() method.
     * @return array List of href values; empty when the page could not be
     *               loaded or nothing matched.
     */
    public function get_links($url, $find)
    {
        $hrefs = array();

        $dom = $this->get_with_curl($url);
        if (!is_object($dom)) {
            $this->html = null;
            return $hrefs;
        }

        foreach ($dom->find($find) as $anchor) {
            $hrefs[] = $anchor->href;
        }

        $this->release_dom($dom);

        return $hrefs;
    }

    /**
     * Imports themes from themify.me that are not stored yet.
     *
     * @return array|false Inserted rows, or false when there was nothing to
     *                     insert or a detail page could not be loaded.
     */
    public function themefyme()
    {
        return $this->crawl_source(
            'themefyme',
            'http://themify.me/themes',
            'ul[class=theme-list] li a',
            'parse_themefyme'
        );
    }

    /**
     * Imports themes from themefuse.com that are not stored yet.
     *
     * @return array|false Inserted rows, or false when there was nothing to
     *                     insert or a detail page could not be loaded.
     */
    public function themefuse()
    {
        return $this->crawl_source(
            'themefuse',
            'http://www.themefuse.com/wp-themes-shop/',
            '.theme-img a',
            'parse_themefuse'
        );
    }

    /**
     * Performs the HTTP request for get_with_curl().
     *
     * @param string $url Absolute URL to request.
     * @return string|false Response body, or false when the request failed.
     */
    private function fetch_body_with_curl($url)
    {
        $curl = curl_init();
        curl_setopt($curl, CURLOPT_URL, $url);
        curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; MSIE 5.01; Windows NT 5.0)');
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, 10);
        $body = curl_exec($curl);
        curl_close($curl);

        return $body;
    }

    /**
     * Clears a parsed page and resets $this->html.
     *
     * @param mixed $dom DOM object returned by get_with_curl(), if any.
     * @return void
     */
    private function release_dom($dom)
    {
        if (is_object($dom)) {
            $dom->clear();
        }
        $this->html = null;
    }

    /**
     * Reads one property of the first element matching a selector.
     *
     * @param object $dom      Parsed page.
     * @param string $selector Selector passed to find().
     * @param string $property Property to read (e.g. plaintext, outertext, src).
     * @return string The property value, or '' when no element matches or the
     *                value is not a string.
     */
    private function node_value($dom, $selector, $property)
    {
        $node = $dom->find($selector, 0);
        if (!is_object($node)) {
            return '';
        }

        $value = $node->$property;

        return is_string($value) ? $value : '';
    }

    /**
     * Shared crawl loop: list page -> new links -> detail pages -> batch insert.
     *
     * @param string $source   Value stored in the `source` column.
     * @param string $list_url Page listing all themes of the source.
     * @param string $selector Selector matching the theme links on that page.
     * @param string $parser   Name of the method that turns a detail page into
     *                         a row; called as $this->$parser($dom, $link).
     * @return array|false Inserted rows, or false when there was nothing new
     *                     or a detail page could not be loaded.
     */
    private function crawl_source($source, $list_url, $selector, $parser)
    {
        $links = $this->get_links($list_url, $selector);
        $links = $this->array_arrange($links);
        $links = $this->diff($source, $links);
        if (!$links) {
            return false;
        }

        $rows = array();
        foreach ($links as $link) {
            $dom = $this->get_with_curl($link);
            if (!is_object($dom)) {
                // diff() only compares row counts, so inserting a partial
                // batch would make later runs skip the page that failed.
                // Insert nothing and let the next run retry the whole set.
                trigger_error('Crawler: could not load ' . $link, E_USER_WARNING);
                $this->html = null;
                return false;
            }

            $rows[] = $this->$parser($dom, $link);
            $this->release_dom($dom);
        }

        $this->db->insert_batch('themes', $rows);

        return $rows;
    }

    /**
     * Builds the row for one themify.me detail page.
     *
     * @param object $dom  Parsed detail page.
     * @param string $link URL of the detail page.
     * @return array Row for the `themes` table.
     */
    private function parse_themefyme($dom, $link)
    {
        // The theme name is the first space-delimited word of the first <h1>.
        $name = strtok($this->node_value($dom, 'h1', 'plaintext'), ' ');
        if ($name === false) {
            $name = '';
        }
        $image = $this->node_value($dom, '.theme-large-screen img', 'src');

        return array(
            'source' => 'themefyme',
            'name' => $name,
            'link' => $link,
            'demo' => 'http://themify.me/demo/#theme=' . strtolower($name),
            'price' => filter_var($this->node_value($dom, 'h1 sup', 'plaintext'), FILTER_SANITIZE_NUMBER_INT),
            'description' => $this->node_value($dom, 'big', 'plaintext'),
            'features' => $this->node_value($dom, 'ul', 'plaintext'),
            'img_large' => $image,
            'img_thumb' => 'http://themify.me/wp-content/themes/themify/thumb.php?src=' . $image . '&q=90&w=220',
        );
    }

    /**
     * Builds the row for one themefuse.com detail page.
     *
     * @param object $dom  Parsed detail page.
     * @param string $link URL of the detail page.
     * @return array Row for the `themes` table.
     */
    private function parse_themefuse($dom, $link)
    {
        $name = $this->node_value($dom, '.theme-price', 'plaintext');

        // The slideshow image src carries the real image URL in its "src="
        // query parameter; fall back to '' when that parameter is absent.
        $slide_src = $this->node_value($dom, '.slideshow img', 'src');
        $image = preg_match("/src=(.*?)&/", $slide_src, $match) ? $match[1] : '';

        return array(
            'source' => 'themefuse',
            'name' => $name,
            'link' => $link,
            'demo' => 'http://themefuse.com/demo/wp/' . strtolower($name) . '/',
            'description' => $this->node_value($dom, '.short-descr', 'plaintext'),
            'highlights' => $this->node_value($dom, '.highlights', 'outertext'),
            'features' => $this->node_value($dom, '.col-features', 'outertext'),
            'theme_info' => $this->node_value($dom, '.col-themeinfo', 'outertext'),
            'img_large' => $image,
            'img_thumb' => 'http://themefuse.com/wp-content/themes/themefuse/thumb.php?src=' . $image . '&h=225&w=431&zc=1&q=100',
        );
    }
}