方法一:使用 cURL(推荐,功能最强大)
<?php
/**
 * Fetch a complete web page with cURL.
 *
 * @param string $url     Target URL.
 * @param array  $options Optional CURLOPT_* overrides; any key supplied here
 *                        replaces the matching default below.
 * @return array On success: ['success' => true, 'http_code' => int,
 *               'header' => string, 'body' => string, 'url' => string].
 *               On failure: ['success' => false, 'error' => string].
 *               Never throws.
 */
function fetchWebpageWithCurl($url, $options = []) {
    $ch = curl_init();

    // Baseline configuration; caller-supplied $options win on conflict.
    $defaultOptions = [
        CURLOPT_URL => $url,
        CURLOPT_RETURNTRANSFER => true,   // return the response instead of echoing it
        CURLOPT_FOLLOWLOCATION => true,   // follow redirects
        CURLOPT_MAXREDIRS => 5,           // cap redirect chains
        CURLOPT_TIMEOUT => 30,            // overall timeout in seconds
        CURLOPT_SSL_VERIFYPEER => false,  // skip SSL verification (testing only; enable in production)
        CURLOPT_SSL_VERIFYHOST => false,  // skip SSL host verification (testing only)
        CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        CURLOPT_HEADER => true,           // include response headers so we can split them off below
        CURLOPT_ENCODING => '',           // accept (and let cURL auto-decode) all supported encodings
    ];

    // array_replace(): defaults first, caller overrides second (last wins).
    curl_setopt_array($ch, array_replace($defaultOptions, $options));

    $response = curl_exec($ch);

    // curl_errno() is the canonical failure check; curl_error() only supplies
    // the message. Also guard against a false return with no errno set.
    if ($response === false || curl_errno($ch) !== 0) {
        $error = curl_error($ch) ?: 'cURL request failed';
        curl_close($ch);
        return [
            'success' => false,
            'error' => $error
        ];
    }

    $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);

    // Split the raw response into header block and body using the size
    // reported by cURL (covers all redirect hops' headers).
    $headerSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
    $header = substr($response, 0, $headerSize);
    $body = substr($response, $headerSize);
    curl_close($ch);

    return [
        'success' => true,
        'http_code' => $httpCode,
        'header' => $header,
        'body' => $body,
        'url' => $url
    ];
}
// Usage example: fetch a page and save it to disk.
$url = 'https://example.com';
$result = fetchWebpageWithCurl($url);

if (!$result['success']) {
    echo "抓取失败:" . $result['error'] . "\n";
} else {
    $body = $result['body'];
    echo "抓取成功!\n";
    echo "HTTP状态码:" . $result['http_code'] . "\n";
    echo "网页内容长度:" . strlen($body) . " 字节\n";
    echo "网页内容预览:\n" . substr($body, 0, 500) . "...\n";

    // Persist the downloaded markup.
    file_put_contents('downloaded_page.html', $body);
    echo "网页已保存到 downloaded_page.html\n";
}
?>方法二:使用 file_get_contents(简单但功能有限)
<?php
/**
 * Fetch a web page with file_get_contents (fallback when cURL is unavailable).
 *
 * @param string $url Target URL.
 * @return array On success: ['success' => true, 'http_code' => int,
 *               'headers' => array (raw response header lines),
 *               'body' => string, 'url' => string].
 *               On failure: ['success' => false, 'error' => string].
 */
function fetchWebpageWithFileGetContents($url) {
    // Stream context: custom request headers, timeout, relaxed SSL (testing only).
    // NOTE: deliberately no "Accept-Encoding: gzip, deflate" header —
    // file_get_contents() does NOT decompress responses, so requesting gzip
    // would hand back raw compressed bytes instead of HTML.
    // NOTE: "Connection: close" instead of "keep-alive" — keep-alive can make
    // the stream wait for the server to close the socket, stalling until timeout.
    $options = [
        'http' => [
            'method' => 'GET',
            'header' => [
                'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language: zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
                'Connection: close',
            ],
            'timeout' => 30,
            'ignore_errors' => true  // still return the body on 4xx/5xx responses
        ],
        'ssl' => [
            'verify_peer' => false,       // testing only; enable in production
            'verify_peer_name' => false
        ]
    ];
    $context = stream_context_create($options);

    try {
        // Single request only: file_get_contents() populates
        // $http_response_header with the response header lines, so there is no
        // need for a separate get_headers() call (which would fire a SECOND,
        // context-less request whose status could disagree with this one).
        $content = file_get_contents($url, false, $context);
        if ($content === false) {
            return [
                'success' => false,
                'error' => '无法获取网页内容'
            ];
        }

        // Take the status code from the LAST status line: with redirects the
        // header list contains one "HTTP/..." line per hop.
        $headers = isset($http_response_header) ? $http_response_header : [];
        $httpCode = 0;
        foreach ($headers as $line) {
            if (preg_match('#^HTTP/\S+\s+(\d{3})#', $line, $m)) {
                $httpCode = (int)$m[1];
            }
        }

        return [
            'success' => true,
            'http_code' => $httpCode,
            'headers' => $headers,
            'body' => $content,
            'url' => $url
        ];
    } catch (Exception $e) {
        return [
            'success' => false,
            'error' => $e->getMessage()
        ];
    }
}
// Usage example for the file_get_contents variant.
$url = 'https://example.com';
$result = fetchWebpageWithFileGetContents($url);

if (!$result['success']) {
    echo "抓取失败:" . $result['error'] . "\n";
} else {
    echo "抓取成功!\n";
    echo "HTTP状态码:" . $result['http_code'] . "\n";

    // Save the page to disk.
    file_put_contents('downloaded_page_fg.html', $result['body']);
    echo "网页已保存到 downloaded_page_fg.html\n";
}
?>方法三:增强版cURL抓取(支持更多功能)
<?php
/**
 * Enhanced web page fetcher built on cURL.
 *
 * Supports custom request headers, cookies, proxies, and HTTP auth through a
 * fluent interface. A single cURL handle is created in the constructor,
 * reused across fetch() calls, and closed in the destructor.
 */
class WebpageFetcher {
    private $ch;            // shared cURL handle
    private $cookies = [];  // cookie name => value pairs sent with every request
    private $headers = [];  // raw "Name: value" request header lines

    /**
     * Create the cURL handle and apply the default configuration.
     */
    public function __construct() {
        $this->ch = curl_init();
        $this->setDefaultOptions();
    }

    /**
     * Apply baseline cURL options to the handle.
     */
    private function setDefaultOptions() {
        $defaultOptions = [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_MAXREDIRS => 10,
            CURLOPT_TIMEOUT => 30,
            CURLOPT_SSL_VERIFYPEER => false,  // testing only; enable in production
            CURLOPT_SSL_VERIFYHOST => false,  // testing only; enable in production
            CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            CURLOPT_HEADER => true,           // keep headers so fetch() can split them off
            CURLOPT_ENCODING => '',           // accept and auto-decode any supported encoding
            CURLOPT_VERBOSE => false,
        ];
        curl_setopt_array($this->ch, $defaultOptions);
    }

    /**
     * Append custom request header lines ("Name: value" strings).
     *
     * @param array $headers Header lines to add to the accumulated set.
     * @return $this
     */
    public function setHeaders($headers) {
        $this->headers = array_merge($this->headers, $headers);
        curl_setopt($this->ch, CURLOPT_HTTPHEADER, $this->headers);
        return $this;
    }

    /**
     * Merge cookie name/value pairs into the cookies sent with each request.
     *
     * @param array $cookies Map of cookie name => value.
     * @return $this
     */
    public function setCookies($cookies) {
        $this->cookies = array_merge($this->cookies, $cookies);
        // Build "k=v; k2=v2" without a dangling separator.
        $pairs = [];
        foreach ($this->cookies as $key => $value) {
            $pairs[] = "$key=$value";
        }
        curl_setopt($this->ch, CURLOPT_COOKIE, implode('; ', $pairs));
        return $this;
    }

    /**
     * Route requests through a proxy.
     *
     * @param string $proxy     Proxy address, e.g. "host:port".
     * @param int    $proxyType One of the CURLPROXY_* constants.
     * @return $this
     */
    public function setProxy($proxy, $proxyType = CURLPROXY_HTTP) {
        curl_setopt($this->ch, CURLOPT_PROXY, $proxy);
        curl_setopt($this->ch, CURLOPT_PROXYTYPE, $proxyType);
        return $this;
    }

    /**
     * Set HTTP authentication credentials.
     *
     * @param string $username
     * @param string $password
     * @param int    $type One of the CURLAUTH_* constants.
     * @return $this
     */
    public function setAuth($username, $password, $type = CURLAUTH_BASIC) {
        curl_setopt($this->ch, CURLOPT_HTTPAUTH, $type);
        curl_setopt($this->ch, CURLOPT_USERPWD, "$username:$password");
        return $this;
    }

    /**
     * Fetch a URL and return status code, parsed headers, and body.
     *
     * @param string $url Target URL.
     * @return array On success: success/http_code/headers/body/url/info.
     *               On failure: ['success' => false, 'error' => string].
     */
    public function fetch($url) {
        curl_setopt($this->ch, CURLOPT_URL, $url);
        $response = curl_exec($this->ch);

        // curl_errno() is the canonical failure check; also guard against a
        // false return with no errno set.
        if ($response === false || curl_errno($this->ch) !== 0) {
            return [
                'success' => false,
                'error' => curl_error($this->ch) ?: 'cURL request failed'
            ];
        }

        $httpCode = curl_getinfo($this->ch, CURLINFO_HTTP_CODE);
        $headerSize = curl_getinfo($this->ch, CURLINFO_HEADER_SIZE);
        $header = substr($response, 0, $headerSize);
        $body = substr($response, $headerSize);

        // Parse the response headers into an associative array.
        $headers = $this->parseHeaders($header);
        return [
            'success' => true,
            'http_code' => $httpCode,
            'headers' => $headers,
            'body' => $body,
            'url' => $url,
            'info' => curl_getinfo($this->ch)
        ];
    }

    /**
     * Parse a raw header string into an associative array of header => value.
     *
     * With CURLOPT_FOLLOWLOCATION enabled, the raw string contains one header
     * block per redirect hop (separated by blank lines); only the FINAL
     * response's headers are returned, so stale values (e.g. Location:) from
     * intermediate redirects cannot leak into the result.
     */
    private function parseHeaders($headerString) {
        // Header blocks are separated by a blank line; keep the last block.
        $blocks = preg_split("/\r?\n\r?\n/", trim($headerString));
        $lastBlock = end($blocks);
        if ($lastBlock === false) {
            return [];
        }

        $headers = [];
        foreach (preg_split("/\r?\n/", $lastBlock) as $line) {
            // Status lines ("HTTP/1.1 200 OK") contain no colon and are skipped.
            if (strpos($line, ':') !== false) {
                list($key, $value) = explode(':', $line, 2);
                $headers[trim($key)] = trim($value);
            }
        }
        return $headers;
    }

    /**
     * Release the cURL handle.
     */
    public function __destruct() {
        if ($this->ch) {
            curl_close($this->ch);
        }
    }

    /**
     * Convenience: fetch a URL with default options in one call.
     *
     * @param string $url Target URL.
     * @return array Same shape as fetch().
     */
    public static function quickFetch($url) {
        $fetcher = new self();
        return $fetcher->fetch($url);
    }
}
// Usage example for the enhanced fetcher.
$fetcher = new WebpageFetcher();

// Configure request headers and cookies via the fluent interface.
$fetcher->setHeaders([
    'Accept-Language: zh-CN,zh;q=0.9',
    'Cache-Control: no-cache'
])->setCookies([
    'session_id' => '123456',
    'user_pref' => 'dark_mode'
]);

$result = $fetcher->fetch('https://example.com');

if (!$result['success']) {
    echo "抓取失败:" . $result['error'] . "\n";
} else {
    echo "抓取成功!\n";
    echo "HTTP状态码:" . $result['http_code'] . "\n";
    echo "Content-Type: " . ($result['headers']['Content-Type'] ?? '未知') . "\n";
    echo "网页大小:" . strlen($result['body']) . " 字节\n";

    // Save the body under a timestamped file name.
    $filename = 'webpage_' . date('Ymd_His') . '.html';
    file_put_contents($filename, $result['body']);
    echo "网页已保存到: $filename\n";

    // Pull the <title> out of the markup, if present.
    if (preg_match('/<title>(.*?)<\/title>/i', $result['body'], $matches)) {
        echo "网页标题: " . $matches[1] . "\n";
    }
}

// One-off fetch via the static helper.
$quickResult = WebpageFetcher::quickFetch('https://example.com');
?>使用说明
选择合适的方案:
- 如果需要处理复杂的网页、需要设置代理、处理Cookie等,使用方法三(增强版)
- 如果只需要简单抓取,使用方法一(cURL)
- 如果环境不支持cURL,可以使用方法二(file_get_contents)
注意事项:
- 确保PHP开启了相应的扩展(cURL、openssl等)
- 遵守网站的robots.txt规则
- 不要过于频繁地抓取同一网站
- 注意处理编码问题
- 生产环境建议开启SSL验证
常见问题处理:
- 如果遇到编码问题,可以使用 mb_convert_encoding() 进行转换
- 如果需要处理JavaScript渲染的页面,可能需要使用无头浏览器(如Puppeteer)
- 对于大文件下载,建议使用流式处理而不是一次性加载到内存
The static quickFetch method is a nice convenience. Sometimes you just need to grab a page quickly without instantiating the whole class. Good design choice to include both flexible and simple interfaces.
实测这个类抓取https://www.douban.com没问题,但遇到那些反爬虫严格的网站,比如淘宝京东,还是会被封。建议在文档里加上设置延时、轮换User-Agent的建议,毕竟君子协定还是要遵守的。
This is a great resource for beginners learning web scraping. The progression from simple cURL to the full class shows good educational design. The comments in Chinese and English code make it accessible to a wider audience.
有个小建议:方法一里CURLOPT_FOLLOWLOCATION设为了true,但CURLOPT_MAXREDIRS只有5次。有些网站会搞无限重定向,5次可能不够。建议根据实际情况调整,或者做成可配置参数。
The header parsing function is simple but effective. I've seen so many over-engineered header parsers that try to handle every RFC edge case. This one does exactly what 99% of use cases need. KISS principle in action.