On https://huyenchip.com/llama-police, Chip Huyen maintains an up-to-date analysis of the open-source AI ecosystem, with a special focus on tools and repositories built around foundation models.
I really enjoy exploring these tools and try to get hands-on with them. Every week, I pick one and build a small toy project to better understand how it works. Each project teaches me something new about AI and large language models (LLMs).
Last week's tool was Crawl4AI. I wanted to create a toy project using it, and chose the CIA World Factbook as the website to crawl.
Let me show you what I did in more detail!
The first thing I did was set up a Python virtual environment and install Crawl4AI using:
pip install crawl4ai
Then I ran:
crawl4ai-setup
This command installs or updates the Playwright browsers that Crawl4AI depends on.
To test the setup, I ran a basic crawl example:
import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    # Create an instance of AsyncWebCrawler
    async with AsyncWebCrawler() as crawler:
        # Run the crawler on a URL
        result = await crawler.arun(url="https://crawl4ai.com")
        # Print the extracted content
        print(result.markdown)

# Run the async main function
asyncio.run(main())
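One small thing I found useful while testing: the result object also carries a success flag and an error message (at least in the version I used), so you can fail loudly when a crawl goes wrong instead of printing empty Markdown. A minimal sketch of that check:

import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://crawl4ai.com")
        # Guard against failed crawls before touching the markdown output
        if result.success:
            print(result.markdown[:500])  # preview the first 500 characters
        else:
            print(f"Crawl failed: {result.error_message}")

asyncio.run(main())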
Now let's move on to something more interesting—crawling the CIA World Factbook. Here's a simple deep crawling example:
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy

async def main():
    config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(
            max_depth=2,
            include_external=False
        ),
        scraping_strategy=LXMLWebScrapingStrategy(),
        verbose=True
    )

    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun("https://www.cia.gov/the-world-factbook/", config=config)

        print(f"Crawled {len(results)} pages in total")

        for result in results[:3]:
            print(f"URL: {result.url}")
            print(f"Depth: {result.metadata.get('depth', 0)}")

if __name__ == "__main__":
    asyncio.run(main())
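A batch crawl like this only returns once every page has been processed. Crawl4AI also supports a streaming mode, where results are yielded as they arrive; the sketch below shows how I understand it from the documentation (the stream=True flag on CrawlerRunConfig is the only change), so treat it as a sketch rather than tested code:

import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy

async def main():
    # Same BFS deep crawl as above, but with stream=True so results
    # are handled one by one instead of collected into a list
    config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=2, include_external=False),
        stream=True,
        verbose=True
    )

    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun("https://www.cia.gov/the-world-factbook/", config=config):
            print(f"Got: {result.url} at depth {result.metadata.get('depth', 0)}")

if __name__ == "__main__":
    asyncio.run(main())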
To save the results, I wrote a helper function that creates a safe filename from a URL:
import re
from urllib.parse import urlparse

def filename_from_url(url, extension=""):
    parsed = urlparse(url)
    base = parsed.netloc + parsed.path
    base = base.rstrip('/')
    base = re.sub(r'[<>:"/\\|?*]', '_', base)
    if parsed.query or parsed.fragment:
        extra = parsed.query + parsed.fragment
        extra = re.sub(r'[<>:"/\\|?*]', '_', extra)
        base += "_" + extra
    if extension and not base.endswith(extension):
        base += extension if extension.startswith('.') else '.' + extension
    return base
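A quick check of what it produces (the country URL here is just an illustration of the Factbook's URL structure):

# Example: a Factbook country page becomes a flat, Windows-safe filename
print(filename_from_url("https://www.cia.gov/the-world-factbook/countries/france/", "md"))
# -> www.cia.gov_the-world-factbook_countries_france.md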
To focus only on relevant pages and strip out boilerplate, I added two kinds of filtering to the crawl configuration: tag exclusions ('form', 'header', 'footer', 'nav', 'img') to drop page chrome from the extracted content, and a filter chain that combines URLPatternFilter (to keep the crawl inside the World Factbook and skip the archives) with ContentTypeFilter (to restrict it to HTML pages). Both appear in the complete script below.
Here’s the complete version that crawls the World Factbook, filters pages, and saves the results as Markdown files:
import asyncio
import re
from urllib.parse import urlparse

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, FilterChain, URLPatternFilter, ContentTypeFilter
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy

def filename_from_url(url, extension=""):
    parsed = urlparse(url)
    # Combine netloc and path
    base = parsed.netloc + parsed.path
    # Remove trailing slashes and unsafe characters
    base = base.rstrip('/')
    base = re.sub(r'[<>:"/\\|?*]', '_', base)  # Windows-safe characters
    # Append the query or fragment if present
    if parsed.query or parsed.fragment:
        extra = parsed.query + parsed.fragment
        extra = re.sub(r'[<>:"/\\|?*]', '_', extra)
        base += "_" + extra
    # Add optional file extension
    if extension and not base.endswith(extension):
        base += extension if extension.startswith('.') else '.' + extension
    return base

filter_chain = FilterChain(
    [
        # Stay inside the World Factbook section of cia.gov
        URLPatternFilter(patterns=["*the-world-factbook*"]),
        # Skip archived editions
        URLPatternFilter(patterns=["*archives*"], reverse=True),
        # Only crawl HTML pages
        ContentTypeFilter(allowed_types=["text/html"])
    ]
)

async def main():
    # Configure a deep crawl with an effectively unlimited depth,
    # relying on the filter chain to keep it inside the Factbook
    config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(
            max_depth=200000,
            include_external=False,
            filter_chain=filter_chain
        ),
        scraping_strategy=LXMLWebScrapingStrategy(),
        verbose=True,
        # Tag exclusions
        excluded_tags=['form', 'header', 'footer', 'nav', 'img'],
        # Link filtering
        exclude_external_links=True,
        exclude_social_media_links=True,
        exclude_external_images=True,
        wait_for_images=False,
    )

    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun("https://www.cia.gov/the-world-factbook/", config=config)

        print(f"Crawled {len(results)} pages in total")

        # Save each page as a Markdown file
        for result in results:
            filepath = filename_from_url(result.url, "md")
            print(f"URL: {result.url}")
            print(f"Depth: {result.metadata.get('depth', 0)}")
            with open(filepath, "w", encoding="utf-8") as f:
                f.write(result.markdown)

if __name__ == "__main__":
    asyncio.run(main())
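After the run, I like to sanity-check the output. This is plain standard-library Python, nothing Crawl4AI-specific; the glob pattern simply assumes the filenames produced by filename_from_url for Factbook URLs, which all start with the www.cia.gov_the-world-factbook prefix:

from pathlib import Path

# Count and preview the Markdown files produced by the crawl
md_files = sorted(Path(".").glob("www.cia.gov_the-world-factbook*.md"))
print(f"Saved {len(md_files)} Markdown files")
for path in md_files[:5]:
    print(path.name, path.stat().st_size, "bytes")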
To learn more about the open-source AI ecosystem tracked by llama-police, refer to https://huyenchip.com/2024/03/14/ai-oss.html.