<?php
/**
 * favicon.php — fetch the favicons associated with a list of URLs.
 *
 * Free software: do whatever you want with it. If you find it useful,
 * the author would appreciate a beer or a soda in return.
 * -- Phyks
 * ---------------------------------------------------------------------------
 */


/**
 * Try to get the favicon associated with some URLs, by parsing the <head>
 * section of each page and, failing that, by probing the favicon.ico file
 * at the root of the server.
 *
 * @param array $urls Array of URLs to inspect.
 * @return array {'favicons', 'errors'}. 'errors' is an array of URLs for
 *         which no favicon could be fetched. 'favicons' is an associative
 *         array keyed by input URL, each value being an array
 *         {'favicon_url', 'sizes'}.
 */
function getFavicon($urls) {
    $favicons = array();
    $errors = array();

    // Convert the plain URL list to the format expected by curl_downloader().
    $curl_urls = array();
    foreach ($urls as $url) {
        $curl_urls[] = array('url' => $url);
    }

    $contents = curl_downloader($curl_urls);
    foreach ($contents['status_codes'] as $url => $status) {
        if ($status != 200) {
            $errors[] = $url;
        }
    }

    foreach ($contents['results'] as $url => $content) {
        // We only need the <head> of the page, not the full document.
        // Guard against pages without a closing </head> tag: strpos()
        // returning false would otherwise truncate the whole page to an
        // empty string via substr(..., 0, 0).
        $head_end = strpos($content, '</head>');
        if ($head_end !== false) {
            $content = substr($content, 0, $head_end) . '</head></html>';
        }

        $html = new DOMDocument();
        $html->strictErrorChecking = false;
        @$html->loadHTML($content); // '@' silences warnings on malformed HTML
        $xml = simplexml_import_dom($html);
        if ($xml === false || !isset($xml->head)) {
            // Unparseable page: it will be retried via /favicon.ico below.
            continue;
        }

        // Try to fetch the favicon URL from the <link rel="...icon..."> tags.
        foreach ($xml->head->children() as $head_tag) {
            if ($head_tag->getName() != 'link') {
                continue;
            }
            $go_next_tag = false;
            foreach ($head_tag->attributes() as $key => $attribute) {
                if ($go_next_tag || $key != 'rel') {
                    continue;
                }
                // Matches rel="icon", rel="shortcut icon", rel="apple-touch-icon", ...
                if (strstr((string) $attribute, 'icon')) {
                    if (isset($head_tag->attributes()['sizes'])) {
                        $sizes = (string) $head_tag->attributes()['sizes'];
                    }
                    else {
                        $sizes = '';
                    }
                    $favicons[$url] = array(
                        'favicon_url' => (string) $head_tag->attributes()['href'],
                        'sizes' => $sizes
                    );
                    $go_next_tag = true;
                }
            }
        }
    }

    // Add to errors the URLs without any favicon associated.
    foreach ($contents['results'] as $url => $content) {
        if (!array_key_exists($url, $favicons)) {
            $errors[] = $url;
        }
    }

    // For errored feeds, check whether the favicon.ico file at the root of
    // the server exists (status-only request, no body download).
    $second_try = array();
    foreach ($errors as $url) {
        // Build scheme://host[:port]/favicon.ico — the documented "root of
        // the server" — rather than appending to the full feed path.
        $parts = parse_url(trim($url));
        if ($parts !== false && isset($parts['scheme'], $parts['host'])) {
            $root = $parts['scheme'] . '://' . $parts['host']
                . (isset($parts['port']) ? ':' . $parts['port'] : '');
        }
        else {
            // Unparseable URL: fall back to the naive trim-based root.
            $root = rtrim(trim($url), '/?');
        }
        $second_try[] = array(
            'input_url' => $url,
            'url' => $root . '/favicon.ico'
        );
    }
    $second_try_curl = curl_downloader($second_try, false);
    $errors = array();

    foreach ($second_try as $tested_url) {
        $status_code = (int) $second_try_curl['status_codes'][$tested_url['url']];
        // Accept 2xx and 3xx (redirects may not be followed under open_basedir).
        if ($status_code >= 200 && $status_code < 400) {
            $favicons[$tested_url['input_url']] = array(
                'favicon_url' => $tested_url['url'],
                'sizes' => ''
            );
        }
        else {
            $errors[] = $tested_url['input_url'];
        }
    }

    return array('favicons' => $favicons, 'errors' => $errors);
}
/**
 * Downloads all the URLs in the array $urls and returns an array with the
 * results and the HTTP status codes.
 *
 * Mostly inspired by blogotext by timovn:
 * https://github.com/timovn/blogotext/blob/master/inc/fich.php
 *
 * @todo If open_basedir or safe_mode is set, curl will not follow redirects:
 * https://stackoverflow.com/questions/24687145/curlopt-followlocation-and-curl-multi-and-safe-mode
 *
 * @param array $urls Array of associative arrays {'url', 'post'} for each
 *        URL. 'post' is a JSON string of data to send _via_ POST (optional).
 * @param bool $fetch_content When false, only status codes are fetched
 *        (HEAD-like request, no body).
 * @return array {'results', 'status_codes'}: 'results' maps each URL to its
 *         retrieved content, 'status_codes' maps each URL to its HTTP status.
 */
function curl_downloader($urls, $fetch_content=true) {
    // Chunks of 40 URLs, because curl has problems with too big "multi" requests.
    $chunks = array_chunk($urls, 40, true);
    $results = array();
    $status_codes = array();

    // CURLOPT_FOLLOWLOCATION is forbidden when open_basedir or safe_mode is
    // active; disable it in that case to avoid warnings.
    if (ini_get('open_basedir') == '' && ini_get('safe_mode') === false) {
        $follow_redirect = true;
    }
    else {
        $follow_redirect = false;
    }

    // Reuse the client's user agent when available (some feeds reject
    // requests without one), with a fallback for CLI / cron contexts where
    // $_SERVER['HTTP_USER_AGENT'] is not defined.
    $user_agent = isset($_SERVER['HTTP_USER_AGENT'])
        ? $_SERVER['HTTP_USER_AGENT']
        : 'Mozilla/5.0 (compatible; favicon fetcher)';

    foreach ($chunks as $chunk) {
        $multihandler = curl_multi_init();
        $handlers = array();

        foreach ($chunk as $i => $url_array) {
            $url = $url_array['url'];
            set_time_limit(20); // Reset max execution time for each handle
            $handlers[$i] = curl_init($url);
            curl_setopt_array($handlers[$i], array(
                CURLOPT_RETURNTRANSFER => true,
                CURLOPT_CONNECTTIMEOUT => 10,
                CURLOPT_TIMEOUT => 15,
                CURLOPT_FOLLOWLOCATION => $follow_redirect,
                CURLOPT_MAXREDIRS => 5,
                CURLOPT_USERAGENT => $user_agent,
                // Status-only mode: fetch headers, skip the body.
                CURLOPT_HEADER => !$fetch_content,
                CURLOPT_NOBODY => !$fetch_content,
            ));
            if (!empty($url_array['post'])) {
                curl_setopt($handlers[$i], CURLOPT_POST, true);
                curl_setopt($handlers[$i], CURLOPT_POSTFIELDS, json_decode($url_array['post'], true));
            }

            curl_multi_add_handle($multihandler, $handlers[$i]);
        }

        // Drive all transfers; curl_multi_select() avoids busy-waiting.
        do {
            curl_multi_exec($multihandler, $active);
            curl_multi_select($multihandler);
        } while ($active > 0);

        foreach ($chunk as $i => $url_array) {
            $url = $url_array['url'];
            $results[$url] = curl_multi_getcontent($handlers[$i]);
            $status_codes[$url] = curl_getinfo($handlers[$i], CURLINFO_HTTP_CODE);
            curl_multi_remove_handle($multihandler, $handlers[$i]);
            curl_close($handlers[$i]);
        }
        curl_multi_close($multihandler);
    }

    return array('results' => $results, 'status_codes' => $status_codes);
}