第一步:创建项目
bash
cargo new web-crawler
cd web-crawler
Cargo.toml
toml
[package]
name = "web-crawler"
version = "0.1.0"
edition = "2021"
[dependencies]
tokio = { version = "1", features = ["full"] }
reqwest = { version = "0.11", features = ["json"] }
scraper = "0.18" # HTML 解析
url = "2"
select = "0.6" # CSS 选择器
serde = { version = "1", features = ["derive"] }
serde_json = "1"
toml = "0.8"
anyhow = "1"
tracing = "0.1"
tracing-subscriber = "0.3"
chrono = { version = "0.4", features = ["serde"] }
futures = "0.3"
async-channel = "2"
第二步:配置模块
config.toml
toml
[crawler]
max_depth = 3
max_concurrent = 10
timeout_seconds = 30
user_agent = "RustCrawler/0.1"
[storage]
output_dir = "./output"
save_html = false
save_text = true
src/config.rs
rust
use serde::Deserialize;
/// Top-level application configuration, deserialized from `config.toml`.
#[derive(Debug, Deserialize)]
pub struct Config {
/// Settings read from the `[crawler]` table.
pub crawler: CrawlerConfig,
/// Settings read from the `[storage]` table.
pub storage: StorageConfig,
}
/// Crawler settings (the `[crawler]` table of `config.toml`).
#[derive(Debug, Deserialize)]
pub struct CrawlerConfig {
/// Crawl depth limit — presumably counted in link hops from the start URL;
/// the consumer is not shown here, confirm against the crawl loop.
pub max_depth: usize,
/// Concurrency limit — presumably the number of simultaneous fetches;
/// the consumer is not shown here, confirm against the crawl loop.
pub max_concurrent: usize,
/// Request timeout in seconds, applied to the HTTP client built by `Fetcher::new`.
pub timeout_seconds: u64,
/// `User-Agent` value configured on the HTTP client built by `Fetcher::new`.
pub user_agent: String,
}
/// Output settings (the `[storage]` table of `config.toml`).
#[derive(Debug, Deserialize)]
pub struct StorageConfig {
/// Directory path for crawl output (`"./output"` in the sample config).
pub output_dir: String,
/// Flag that presumably enables saving raw HTML — the code that reads it is
/// not shown here, confirm against the storage module.
pub save_html: bool,
/// Flag that presumably enables saving extracted text — the code that reads
/// it is not shown here, confirm against the storage module.
pub save_text: bool,
}
impl Config {
pub fn load() -> anyhow::Result<Self> {
let content = std::fs::read_to_string("config.toml")?;
let config: Config = toml::from_str(&content)?;
Ok(config)
}
}
第三步:网页获取
src/crawler/fetcher.rs
rust
use reqwest::Client;
use std::time::Duration;
use crate::config::CrawlerConfig;
/// Web page fetcher backed by a `reqwest` client.
pub struct Fetcher {
/// HTTP client used for every request issued by this fetcher.
client: Client,
}
impl Fetcher {
    /// Creates a fetcher whose HTTP client uses the timeout and user agent
    /// taken from `config`.
    ///
    /// # Panics
    ///
    /// Panics if the underlying `reqwest` client cannot be constructed.
    pub fn new(config: &CrawlerConfig) -> Self {
        let client = Client::builder()
            .timeout(Duration::from_secs(config.timeout_seconds))
            .user_agent(&config.user_agent)
            .build()
            .expect("failed to build the reqwest HTTP client");
        Self { client }
    }

    /// Fetches `url` and returns the response body as text.
    ///
    /// Because the caller receives only the body, an HTTP error status is
    /// reported as an error instead of handing back the error page as if it
    /// were real content. Use [`Fetcher::fetch_with_info`] to inspect
    /// non-success responses.
    ///
    /// # Errors
    ///
    /// Returns an error if the request fails, if the server answers with a
    /// 4xx/5xx status, or if the body cannot be read as text.
    pub async fn fetch(&self, url: &str) -> anyhow::Result<String> {
        let response = self.client.get(url).send().await?.error_for_status()?;
        let html = response.text().await?;
        Ok(html)
    }

    /// Fetches `url` and returns the body together with the status code and
    /// response headers.
    ///
    /// Unlike [`Fetcher::fetch`], a 4xx/5xx response is not an error here: the
    /// status is reported in [`FetchResult::status`] for the caller to check.
    ///
    /// # Errors
    ///
    /// Returns an error if the request fails or the body cannot be read as text.
    pub async fn fetch_with_info(&self, url: &str) -> anyhow::Result<FetchResult> {
        let response = self.client.get(url).send().await?;

        // Status and headers must be captured first: `text()` consumes the response.
        let status = response.status().as_u16();
        let headers = response.headers().clone();
        let html = response.text().await?;

        Ok(FetchResult {
            url: url.to_string(),
            status,
            headers,
            html,
        })
    }
}
/// 获取结果
pub struct FetchResult {
pub url: String,
pub status: u16,
pub headers: reqwest::header::HeaderMap,
pub html: String,
}
第四步:HTML 解析
src/crawler/parser.rs
rust
use scraper::{Html, Selector};
/// HTML parser that wraps one parsed document and exposes extraction helpers.
pub struct HtmlParser {
/// The parsed document that every extraction method queries.
document: Html,
}
impl HtmlParser {
/// Builds a parser by parsing `html` as a complete HTML document.
pub fn from_html(html: &str) -> Self {
    Self {
        document: Html::parse_document(html),
    }
}
/// Returns the concatenated text of the first `<title>` element, or `None`
/// when the document has no such element.
pub fn title(&self) -> Option<String> {
    let title_selector = Selector::parse("title").unwrap();
    let first_title = self.document.select(&title_selector).next()?;
    Some(first_title.text().collect::<String>())
}
/// Extracts the target of every `<a href>` element, resolved against `base_url`.
///
/// Only `http` and `https` URLs are kept. Links that cannot be resolved are
/// skipped, and an unparseable `base_url` yields an empty list.
pub fn links(&self, base_url: &str) -> Vec<String> {
    // The base is the same for every anchor, so parse it once up front
    // instead of once per link.
    let Ok(base) = url::Url::parse(base_url) else {
        return Vec::new();
    };
    let selector = Selector::parse("a[href]").expect("static selector `a[href]` is valid");

    self.document
        .select(&selector)
        .filter_map(|el| el.value().attr("href"))
        // Resolve relative URLs against the page URL.
        .filter_map(|href| base.join(href).ok())
        // Drop `mailto:`, `javascript:`, and other non-web schemes.
        .filter(|url| matches!(url.scheme(), "http" | "https"))
        .map(|url| url.to_string())
        .collect()
}
/// Returns the text of the first `<body>` element with every run of
/// whitespace collapsed to a single space, or an empty string when the
/// document has no `<body>`.
pub fn text(&self) -> String {
    let selector = Selector::parse("body").unwrap();
    self.document
        .select(&selector)
        .next()
        .map(|body| {
            // Splitting each text node into words directly gives the same
            // result as joining the nodes with spaces and re-splitting, without
            // building the intermediate vector and string.
            body.text()
                .flat_map(|chunk| chunk.split_whitespace())
                .collect::<Vec<_>>()
                .join(" ")
        })
        .unwrap_or_default()
}
/// Extracts the source of every `<img src>` element, resolved against `base_url`.
///
/// Sources that cannot be resolved are skipped, and an unparseable `base_url`
/// yields an empty list. No scheme filtering is applied.
pub fn images(&self, base_url: &str) -> Vec<String> {
    // The base is the same for every image, so parse it once up front
    // instead of once per element.
    let Ok(base) = url::Url::parse(base_url) else {
        return Vec::new();
    };
    let selector = Selector::parse("img[src]").expect("static selector `img[src]` is valid");

    self.document
        .select(&selector)
        .filter_map(|el| el.value().attr("src"))
        .filter_map(|src| base.join(src).ok())
        .map(|url| url.to_string())
        .collect()
}
/// 使用 CSS 选择器提取元素
pub fn select(&self, selector: &str) -> Vec<String> {
let sel = Selector::parse(selector).unwrap();
self.document
.select(&sel)
.map(|el| el.text().collect::<String>())
.collect()
}
/// Collects `<meta name="…" content="…">` pairs into a map keyed by `name`.
///
/// When several tags share a name, the last one in document order wins.
pub fn meta(&self) -> std::collections::HashMap<String, String> {
    let meta_selector = Selector::parse("meta[name][content]").unwrap();
    let mut entries = std::collections::HashMap::new();
    for tag in self.document.select(&meta_selector) {
        let attrs = tag.value();
        if let (Some(name), Some(content)) = (attrs.attr("name"), attrs.attr("content")) {
            entries.insert(name.to_string(), content.to_string());
        }
    }
    entries
}
}
第五步:URL 队列
src/crawler/queue.rs
rust
use std::collections::{HashSet, VecDeque};
use std::sync::Arc;
use tokio::sync::Mutex;
/// De-duplicating FIFO queue of URLs, shareable between crawler tasks.
///
/// A URL is accepted at most once over the lifetime of the queue: pushing a
/// URL that is already waiting, or that has already been popped, is a no-op.
///
/// Lock order: whenever both locks are held, `visited` is acquired before
/// `pending`. `pop` takes only `pending`, so no lock-order cycle is possible.
pub struct UrlQueue {
    /// URLs accepted by `push` and not yet handed out by `pop`, oldest first.
    pending: Arc<Mutex<VecDeque<String>>>,
    /// Every URL ever accepted by `push`, whether still pending or already
    /// popped. Recording URLs at enqueue time is what prevents the same URL
    /// from being queued (and crawled) more than once.
    visited: Arc<Mutex<HashSet<String>>>,
}

impl UrlQueue {
    /// Creates an empty queue.
    pub fn new() -> Self {
        Self {
            pending: Arc::new(Mutex::new(VecDeque::new())),
            visited: Arc::new(Mutex::new(HashSet::new())),
        }
    }

    /// Enqueues `url` unless it has been accepted before.
    pub async fn push(&self, url: String) {
        let mut visited = self.visited.lock().await;
        if visited.contains(&url) {
            return;
        }
        visited.insert(url.clone());
        // `visited` stays locked until the URL is queued, so observers holding
        // both locks always see every pending URL recorded in `visited`.
        self.pending.lock().await.push_back(url);
    }

    /// Enqueues every URL in `urls` that has not been accepted before,
    /// preserving their order. Both locks are taken once for the whole batch.
    pub async fn push_batch(&self, urls: Vec<String>) {
        let mut visited = self.visited.lock().await;
        let mut pending = self.pending.lock().await;
        for url in urls {
            if !visited.contains(&url) {
                visited.insert(url.clone());
                pending.push_back(url);
            }
        }
    }

    /// Removes and returns the oldest pending URL, or `None` if nothing is
    /// waiting.
    pub async fn pop(&self) -> Option<String> {
        // The URL was already recorded in `visited` when it was pushed, so
        // only the `pending` lock is needed here.
        let mut pending = self.pending.lock().await;
        pending.pop_front()
    }

    /// Returns `true` when no URL is waiting to be popped.
    pub async fn is_empty(&self) -> bool {
        let pending = self.pending.lock().await;
        pending.is_empty()
    }

    /// Returns the number of URLs waiting to be popped.
    pub async fn size(&self) -> usize {
        let pending = self.pending.lock().await;
        pending.len()
    }

    /// Returns the number of URLs that have been handed out by `pop`.
    pub async fn visited_count(&self) -> usize {
        // Accepted minus still-pending equals popped. Both locks are held
        // (in the documented order) so the two sizes are read consistently.
        let visited = self.visited.lock().await;
        let pending = self.pending.lock().await;
        visited.len().saturating_sub(pending.len())
    }
}

impl Default for UrlQueue {
    fn default() -> Self {
        Self::new()
    }
}