| Вопрос: мне нужно спарсить сценаристов с imdb.de. Актёров я беру так:
Код:
...
// phpQuery/CSS selectors; the keys match the $this->path[...] lookups in get_actors() and get_actor_info().
// Anchor elements of the cast table on the "fullcredits" page.
'actors_links' => 'table.cast tr td.nm a',
// Actor-page fields: the info block that directly follows the <h5> carrying the German label.
'actor_birthday' => 'h5:contains("Geburtstag") + div.info-content',
'actor_death' => 'h5:contains("Todestag") + div.info-content',
// The label is "Größe" (height); it had been mis-encoded as "GrцЯe", which cannot match the heading.
// NOTE(review): the literal must be in the same encoding as the parsed document - confirm the file encoding.
'actor_height' => 'h5:contains("Größe") + div.info-content',
'actor_nickname' => 'h5:contains("Spitzname") + div.info-content'
...
/**
 * Downloads the film's "fullcredits" page and gathers details for every cast member.
 *
 * Every anchor matched by $this->path['actors_links'] yields a link/name pair that is
 * passed to get_actor_info(); the collected results are stored in $this->film['actors'].
 * When the page cannot be downloaded, $this->film['actors'] is left untouched.
 *
 * @return void
 */
private function get_actors(){
    // Upper bound on download attempts. With CURLOPT_FAILONERROR enabled a permanent
    // HTTP error (e.g. 404) is reported on every attempt, so an unbounded retry loop
    // would never terminate.
    $max_attempts = 5;

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_USERAGENT, USER_AGENT);
    curl_setopt($ch, CURLOPT_URL, BASE_PARSE_URL.$this->link.'fullcredits');
    curl_setopt($ch, CURLOPT_FAILONERROR, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_AUTOREFERER, true);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_TIMEOUT, 3);

    $attempt = 0;
    do {
        // curl_setopt($ch, CURLOPT_PROXY, $this->get_random_proxy());
        $html = curl_exec($ch);
        $error = curl_errno($ch);
        $attempt++;
    } while ($error != 0 && $attempt < $max_attempts);
    curl_close($ch);

    // Give up quietly when every attempt failed or the response body is empty.
    if ($error != 0 || !$html) {
        return;
    }

    $doc = phpQuery::newDocumentHTML($html);

    // Collect all links first, before any actor page is requested.
    $actors_links = array();
    $actors_links_raw = $doc[$this->path['actors_links']];
    foreach ($actors_links_raw as $actors_link) {
        // TODO: add a check so that an already parsed actor is not parsed again
        $actors_links[] = array(
            'link' => $actors_link->getAttribute('href'),
            'name' => $actors_link->nodeValue
        );
    }

    $actors = array();
    foreach ($actors_links as $actor_link) {
        $actors[] = $this->get_actor_info($actor_link);
    }
    $this->film['actors'] = $actors;
}
/**
 * Downloads a single actor page and extracts the actor's details.
 *
 * @param array $link Array with the keys 'link' (path appended to BASE_PARSE_URL)
 *                    and 'name' (copied into the result unchanged).
 * @return array Always contains 'link', 'id' and 'name'; 'birthday', 'death', 'height'
 *               and 'nickname' are added only when the matching selector yields text.
 *               NOTE(review): what happens when the download yields no HTML is not
 *               visible in this excerpt - confirm.
 */
private function get_actor_info($link){
// Remove the script execution time limit.
set_time_limit(0);
$ch = curl_init();
curl_setopt($ch, CURLOPT_USERAGENT, USER_AGENT);
curl_setopt($ch, CURLOPT_URL, BASE_PARSE_URL.$link['link']);
curl_setopt($ch, CURLOPT_FAILONERROR, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_AUTOREFERER, true);
curl_setopt($ch, CURLOPT_RETURNTRANSFER,true);
curl_setopt($ch, CURLOPT_TIMEOUT, 3);
// Repeat the request until cURL reports no error.
// NOTE(review): the loop has no upper bound; an error that repeats on every attempt
// would keep it running forever - consider limiting the number of attempts.
do{
// curl_setopt($ch, CURLOPT_PROXY, $this->get_random_proxy());
$html = curl_exec($ch);
$error = curl_errno($ch);
}
while($error!=0);
curl_close($ch);
if($html){
// NOTE(review): $actor_doc is not referenced again; the pq() calls below pass no document,
// so they presumably resolve against the most recently created phpQuery document - confirm.
$actor_doc = phpQuery::newDocumentHTML($html);
$actor = array();
$actor['link'] = $link['link'];
// The id is the link with every non-digit character removed.
$actor['id'] = preg_replace('/[^0-9]/s', '', $link['link']);
$actor['name'] = $link['name'];
$birthday = pq($this->path['actor_birthday']);
if($birthday->text()!=''){
// Drop bracketed text (greedy: from the first "[" to the last "]"), then split on commas.
$birthday = preg_replace('/\[(.*)?]/s', '', $birthday->text());
$birthday = explode(',', $birthday);
// derive the place of birth depending on how many comma-separated parts are present;
// part 0 is always kept as the raw date string
if(count($birthday)>3){
$data = array(
'country' => trim($birthday[3]),
'state' => trim($birthday[2]),
'city' => trim($birthday[1]),
'birthday_date' => $birthday[0]
);
} elseif(count($birthday)==3) {
$data = array(
'country' => trim($birthday[2]),
'city' => trim($birthday[1]),
'birthday_date' => $birthday[0]
);
} elseif(count($birthday)==2){
$data = array(
'country' => trim($birthday[1]),
'birthday_date' => $birthday[0]
);
} elseif(count($birthday)==1){
$data = array(
'birthday_date' => $birthday[0]
);
}
// parse the birth date: split on single spaces and rebuild it as "year-month-day";
// if the split yields a number of parts other than 1-3, the raw string is left as is
$tmpDate = explode(' ', $data['birthday_date']);
foreach($tmpDate as $key=>$value){
$tmpDate[$key] = trim($value);
}
if(count ($tmpDate) == 3){
// day-month-year (dots are removed from the day part)
// NOTE(review): get_month() is defined elsewhere; presumably maps a month name to its number - confirm.
$tmpDate[0] = str_replace('.', '', $tmpDate[0]);
$data['birthday_date'] = $tmpDate[2].'-'.get_month($tmpDate[1]).'-'.$tmpDate[0];
} elseif(count ($tmpDate) == 2) {
// month-year (day filled with "00")
$data['birthday_date'] = $tmpDate[1].'-'.get_month($tmpDate[0]).'-00';
} elseif(count ($tmpDate) == 1) {
// year only (month and day filled with "00")
$data['birthday_date'] = $tmpDate[0].'-00-00';
}
$actor['birthday'] = $data;
}
$death = pq($this->path['actor_death']);
if($death->text()!=''){
// death date: everything from the first comma onward is discarded
$date = trim(preg_replace('/\,(.*)/s', '', $death->text()));
$tmpDate = explode(' ', $date);
foreach($tmpDate as $key=>$value){
$tmpDate[$key] = trim($value);
}
// Same rebuilding as for the birth date; 'death' stays unset for any other number of parts.
if(count ($tmpDate) == 3){
// day-month-year
$tmpDate[0] = str_replace('.', '', $tmpDate[0]);
$actor['death'] = $tmpDate[2].'-'.get_month($tmpDate[1]).'-'.$tmpDate[0];
} elseif(count ($tmpDate) == 2) {
// month-year
$actor['death'] = $tmpDate[1].'-'.get_month($tmpDate[0]).'-00';
} elseif(count ($tmpDate) == 1) {
// year only
$actor['death'] = $tmpDate[0].'-00-00';
}
}
$height = pq($this->path['actor_height']);
if($height->text()!=''){
// Keep only digits and commas from the height text.
$actor['height'] = trim(preg_replace('/[^0-9,]/s', '', $height->text()));
}
$nickname = pq($this->path['actor_nickname']);
if($nickname->text()!=''){
$actor['nickname'] = trim($nickname->text());
}
return $actor;
} Но они находятся в fullcredits. Как мне спарсить сценаристов? |