A PHP library that tries to fetch favicons from URLs, so that you do not have to rely on Google S2 and similar services.

favicon.php 8.0KB


  1. <?php
  2. /** Favicons Lib
  3. * ------------
  4. * @copyright SODAWARE License (See below)
  5. * @brief Simple lib to try to get favicons from URLs.
  6. */
  7. /* LICENSE
  8. * --------------------------------------------------------------------------------
  9. * "THE NO-ALCOHOL BEER-WARE LICENSE" (Revision 42):
  10. * Phyks (webmaster@phyks.me) wrote this file. As long as you retain this notice you
  11. * can do whatever you want with this stuff (and you can also do whatever you want
  12. * with this stuff without retaining it, but that's not cool...). If we meet some
  13. * day, and you think this stuff is worth it, you can buy me a <del>beer</del> soda
  14. * in return.
  15. * Phyks
  16. * ---------------------------------------------------------------------------------
  17. */
  /**
   * Try to get the favicons associated with some URLs.
   *
   * Two strategies are applied in turn:
   *  1. URLs ending in `.html` are downloaded and every `<link>` tag of their
   *     `<head>` whose `rel` attribute contains "icon" is collected.
   *  2. For every URL that is not an `.html` file, did not answer with a 200
   *     status, or had no icon link, the file `favicon.ico` at the root of the
   *     server is probed with a body-less request.
   *
   * @param array $urls List of URLs (strings).
   * @return array {'favicons', 'errors'}. `favicons` is an array indexed by input
   *         URL; each value is a list of associative arrays
   *         {'favicon_url', 'sizes'} (`sizes` is '' when unknown). `errors` is the
   *         list of input URLs for which no favicon at all could be found.
   */
  function getFavicon($urls) {
      $favicons = array();
      $errors = array();
      if (empty($urls)) {
          // Nothing to look up: skip the network layer entirely
          return array('favicons'=>$favicons, 'errors'=>$errors);
      }

      // Convert array to the good format for curl downloader
      $curl_urls = array();
      // URLs that need the favicon.ico fallback, indexed by URL so that a URL
      // is never queued (and thus never probed / reported) twice
      $retry = array();
      foreach ($urls as $url) {
          if (endswith($url, '.html')) { // Only check html files using first method
              $curl_urls[] = array('url'=>$url);
          }
          else {
              $retry[$url] = $url;
          }
      }

      $contents = curl_downloader($curl_urls);
      foreach ($contents['results'] as $url=>$content) {
          $status = isset($contents['status_codes'][$url]) ? (int) $contents['status_codes'][$url] : 0;
          if ($status != 200) {
              $retry[$url] = $url;
          }
          // Whatever was downloaded is still parsed, even for a non-200 answer
          $icons = _favicon_links_from_head($content);
          if (empty($icons)) {
              $retry[$url] = $url;
          }
          else {
              $favicons[$url] = $icons;
          }
      }

      // Check, for the URLs queued above, whether the favicon.ico file at the root exists
      $second_try = array();
      $requests = array(); // Each distinct favicon.ico URL is requested only once
      foreach ($retry as $url) {
          $second_try_url = _favicon_root_url($url);
          $second_try[] = array(
              'input_url'=>$url,
              'url'=>$second_try_url
          );
          if ($second_try_url !== null) {
              $requests[$second_try_url] = array('url'=>$second_try_url);
          }
      }
      $second_try_curl = curl_downloader(array_values($requests), false);

      foreach ($second_try as $tested_url) {
          $status_code = 0; // URLs without a host cannot be probed and count as failures
          if ($tested_url['url'] !== null && isset($second_try_curl['status_codes'][$tested_url['url']])) {
              $status_code = (int) $second_try_curl['status_codes'][$tested_url['url']];
          }
          if ($status_code >= 200 && $status_code < 400) {
              $favicons[$tested_url['input_url']][] = array(
                  'favicon_url'=>$tested_url['url'],
                  'sizes'=>''
              );
          }
          elseif (!isset($favicons[$tested_url['input_url']])) {
              // Only report an error when neither strategy found anything
              $errors[] = $tested_url['input_url'];
          }
      }
      return array('favicons'=>$favicons, 'errors'=>$errors);
  }

  /**
   * Extract the icon links declared in the `<head>` of an HTML document.
   *
   * @param string $content Raw HTML of the page.
   * @return array List of associative arrays {'favicon_url', 'sizes'}; empty if
   *         the document could not be parsed or declares no icon.
   */
  function _favicon_links_from_head($content) {
      if (!is_string($content) || $content === '') {
          return array();
      }
      // We don't need the full page, just the <head>. The closing tag is looked
      // up case-insensitively; when it is missing the whole document is parsed.
      $head_end = stripos($content, '</head>');
      if ($head_end !== false) {
          $content = substr($content, 0, $head_end).'</head></html>';
      }

      $html = new DOMDocument();
      $html->strictErrorChecking = false;
      // Collect libxml parse errors internally instead of emitting PHP warnings
      $previous = libxml_use_internal_errors(true);
      $success = $html->loadHTML($content);
      libxml_clear_errors();
      libxml_use_internal_errors($previous);
      if ($success === false) {
          return array();
      }

      $heads = $html->getElementsByTagName('head');
      if ($heads->length === 0) {
          return array();
      }

      $icons = array();
      foreach ($heads->item(0)->childNodes as $head_tag) {
          if ($head_tag->nodeType !== XML_ELEMENT_NODE || strtolower($head_tag->nodeName) !== 'link') {
              continue;
          }
          // Matches "icon", "shortcut icon", "apple-touch-icon", ... in any case
          if (stripos($head_tag->getAttribute('rel'), 'icon') === false) {
              continue;
          }
          $href = trim($head_tag->getAttribute('href'));
          if ($href === '') {
              continue; // A link without target is useless as a favicon
          }
          $icons[] = array(
              'favicon_url'=>$href,
              'sizes'=>$head_tag->getAttribute('sizes') // '' when the attribute is absent
          );
      }
      return $icons;
  }

  /**
   * Build the URL of the `favicon.ico` file at the root of the server of $url.
   *
   * @param string $url Any URL.
   * @return string|null `scheme://[user[:pass]@]host[:port]/favicon.ico`, or null
   *         when no host can be extracted from $url.
   */
  function _favicon_root_url($url) {
      $parsed_url = parse_url(trim($url));
      if ($parsed_url === false || empty($parsed_url['host'])) {
          return null;
      }
      $root = '';
      if (isset($parsed_url['scheme'])) {
          $root .= $parsed_url['scheme'].'://';
      }
      if (isset($parsed_url['user'])) {
          $root .= $parsed_url['user'];
          if (isset($parsed_url['pass'])) {
              $root .= ':'.$parsed_url['pass'];
          }
          $root .= '@';
      }
      $root .= $parsed_url['host'];
      if (isset($parsed_url['port'])) {
          $root .= ':'.$parsed_url['port'];
      }
      return $root.'/favicon.ico';
  }
  /**
   * Downloads all the urls in the array $urls and returns an array with the results and the http status codes.
   *
   * Mostly inspired by blogotext by timovn : https://github.com/timovn/blogotext/blob/master/inc/fich.php
   *
   * @todo If open_basedir or safe_mode, Curl will not follow redirections :
   * https://stackoverflow.com/questions/24687145/curlopt-followlocation-and-curl-multi-and-safe-mode
   *
   * @param array $urls List of associative arrays {'url', 'post'} for each URL. 'post' is optional and is a JSON array of data to send _via_ POST.
   * @param bool $fetch_content If true (default), the body of each response is fetched. If false, the request is sent without body (CURLOPT_NOBODY) and the response headers are returned instead.
   * @return array {'results', 'status_codes'}, 'results' being an array of the retrieved contents, indexed by URL, and 'status_codes' being an array of HTTP status codes, indexed by URL.
   */
  function curl_downloader($urls, $fetch_content=true) {
      $chunks = array_chunk($urls, 40, true); // Chunks of 40 urls because curl has problems with too big "multi" requests
      $results = array();
      $status_codes = array();

      // Disable followlocation option if open_basedir or safe_mode is activated, to avoid warnings.
      // ini_get() returns false when the directive does not exist and a string
      // such as '', '0' or '1' otherwise, hence the boolean validation.
      $open_basedir = (string) ini_get('open_basedir');
      $safe_mode = filter_var(ini_get('safe_mode'), FILTER_VALIDATE_BOOLEAN);
      $follow_redirect = ($open_basedir === '' && !$safe_mode);

      // Add a user agent to prevent problems with some feeds. The one of the
      // visitor is reused when available (it is not in CLI, for instance).
      if (!empty($_SERVER['HTTP_USER_AGENT'])) {
          $user_agent = $_SERVER['HTTP_USER_AGENT'];
      }
      else {
          $user_agent = 'Mozilla/5.0 (compatible; favicon-fetcher)';
      }

      foreach ($chunks as $chunk) {
          $multihandler = curl_multi_init();
          $handlers = array();
          foreach ($chunk as $i=>$url_array) {
              $url = $url_array['url'];
              set_time_limit(20); // Reset max execution time
              $handler = curl_init($url);
              if ($handler === false) {
                  continue; // Reported below as a failed download
              }
              $handlers[$i] = $handler;
              curl_setopt_array($handlers[$i], array(
                  CURLOPT_RETURNTRANSFER => TRUE,
                  CURLOPT_CONNECTTIMEOUT => 10,
                  CURLOPT_TIMEOUT => 15,
                  CURLOPT_FOLLOWLOCATION => $follow_redirect,
                  CURLOPT_MAXREDIRS => 5,
                  CURLOPT_USERAGENT => $user_agent,
                  CURLOPT_HEADER => $fetch_content ? FALSE : TRUE,
                  CURLOPT_NOBODY => $fetch_content ? FALSE : TRUE,
              ));
              if (!empty($url_array['post'])) {
                  curl_setopt($handlers[$i], CURLOPT_POST, true);
                  curl_setopt($handlers[$i], CURLOPT_POSTFIELDS, json_decode($url_array['post'], true));
              }
              curl_multi_add_handle($multihandler, $handlers[$i]);
          }

          // Run all the transfers of the chunk until none is active anymore
          $active = 0;
          do {
              $exec_status = curl_multi_exec($multihandler, $active);
              if ($active > 0 && $exec_status === CURLM_OK) {
                  // Wait for activity on any connection. When select cannot
                  // wait (it returns -1), sleep a bit to avoid a busy loop.
                  if (curl_multi_select($multihandler, 1.0) === -1) {
                      usleep(100000);
                  }
              }
          } while ($active > 0 && ($exec_status === CURLM_OK || $exec_status === CURLM_CALL_MULTI_PERFORM));

          foreach ($chunk as $i=>$url_array) {
              $url = $url_array['url'];
              if (!isset($handlers[$i])) {
                  // The handle could not even be created
                  $results[$url] = '';
                  $status_codes[$url] = 0;
                  continue;
              }
              $results[$url] = curl_multi_getcontent($handlers[$i]);
              $status_codes[$url] = curl_getinfo($handlers[$i], CURLINFO_HTTP_CODE);
              curl_multi_remove_handle($multihandler, $handlers[$i]);
              if (PHP_VERSION_ID < 80000) {
                  // curl_close() has no effect since PHP 8.0; it is only needed
                  // to free the handle on older versions.
                  curl_close($handlers[$i]);
              }
          }
          curl_multi_close($multihandler);
      }
      return array('results'=>$results, 'status_codes'=>$status_codes);
  }
  /**
   * Tell whether the string $haystack finishes with the string $needle.
   *
   * An empty $needle is considered to terminate any string.
   */
  function endswith($haystack, $needle) {
      $needle_size = strlen($needle);
      if ($needle_size != 0) {
          // Compare the last $needle_size bytes of $haystack with $needle
          $tail = substr($haystack, -$needle_size);
          return $tail === $needle;
      }
      return true;
  }