import subprocess
import sys
def _run_step(command, *, quiet=True):
    """Run one installation command and warn (without aborting) if it fails.

    Args:
        command: Argument list handed to ``subprocess.run``.
        quiet: When true, the command's stdout/stderr are captured instead of
            being echoed to the console.

    Returns:
        The ``subprocess.CompletedProcess`` for the command.
    """
    completed = subprocess.run(command, capture_output=quiet)
    if completed.returncode != 0:
        # Output may have been captured, so surface at least the exit status;
        # otherwise a failed install would be indistinguishable from success.
        print(f"⚠️ Command exited with status {completed.returncode}: {' '.join(command)}")
    return completed


# Shared libraries installed through apt-get before the Playwright browser setup.
_SYSTEM_PACKAGES = [
    'libnss3', 'libnspr4', 'libatk1.0-0', 'libatk-bridge2.0-0',
    'libcups2', 'libdrm2', 'libxkbcommon0', 'libxcomposite1',
    'libxdamage1', 'libxfixes3', 'libxrandr2', 'libgbm1',
    'libasound2', 'libpango-1.0-0', 'libcairo2',
]

print("📦 Putting in system dependencies…")
_run_step(['apt-get', 'update', '-qq'])
_run_step(['apt-get', 'install', '-y', '-qq', *_SYSTEM_PACKAGES])
print("✅ System dependencies put in!")
print("\n📦 Putting in Python packages…")
# pip output is left visible (only reduced by -q), as in the original call.
_run_step(
    [sys.executable, '-m', 'pip', 'install', '-U', 'crawl4ai', 'nest_asyncio', 'pydantic', '-q'],
    quiet=False,
)
print("✅ Python packages put in!")
print("\n📦 Putting in Playwright browsers (this will take a minute)…")
_run_step([sys.executable, '-m', 'playwright', 'install', 'chromium'])
_run_step([sys.executable, '-m', 'playwright', 'install-deps', 'chromium'])
print("✅ Playwright browsers put in!")
# nest_asyncio is applied before the asyncio.run() calls below; it is intended
# to let asyncio.run() work where an event loop is already running (e.g. a notebook).
import nest_asyncio
nest_asyncio.apply()
import asyncio
import json
from typing import List, Optional
from pydantic import BaseModel, Field
print("\n" + "=" * 60)
print("✅ INSTALLATION COMPLETE! Able to crawl!")
print("=" * 60)
print("\n" + "=" * 60)
print("📖 PART 2: BASIC CRAWLING")
print("=" * 60)
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
async def basic_crawl():
    """Fetch a single page with a default crawler and print a short summary.

    Prints whether the crawl succeeded, the page title from the result
    metadata, the length of the raw markdown and its first 500 characters.

    Returns:
        The result object returned by ``AsyncWebCrawler.arun``. If the crawl
        did not succeed it is returned right after the status is printed.
    """
    print("\n🔍 Operating fundamental crawl on example.com…")
    async with AsyncWebCrawler() as crawler:
        # example.com is the reserved documentation domain; the previous host
        # name was a garbled form of it.
        result = await crawler.arun(url="https://example.com")
        print(f"\n✅ Crawl profitable: {result.success}")
        if not result.success:
            # A failed crawl may carry no metadata/markdown, so stop before
            # dereferencing them.
            print(f"❌ Error: {result.error_message}")
            return result
        title = (result.metadata or {}).get('title', 'N/A')
        raw_markdown = result.markdown.raw_markdown
        print(f"📄 Title: {title}")
        print(f"📝 Markdown size: {len(raw_markdown)} characters")
        print("\n— First 500 chars of markdown —")
        print(raw_markdown[:500])
        return result
# Run the basic crawl demo, then announce the next tutorial section.
consequence = asyncio.run(basic_crawl())
print("\n" + "=" * 60)
print("⚙️ PART 3: CONFIGURED CRAWLING")
print("=" * 60)
async def configured_crawl():
    """Crawl one page using explicit browser and per-run configuration.

    Builds a ``BrowserConfig`` (headless, 1920x1080 viewport, custom
    User-Agent) and a ``CrawlerRunConfig`` (cache bypass, 30000 page timeout,
    wait until ``networkidle``), crawls ``https://httpbin.org/html`` and
    prints the success flag, status code and the first 400 characters of the
    raw markdown.

    Returns:
        The result object returned by ``AsyncWebCrawler.arun``. If the crawl
        did not succeed it is returned before the content preview.
    """
    print("\n🔧 Operating configured crawl with customized settings…")
    browser_config = BrowserConfig(
        headless=True,
        verbose=True,
        viewport_width=1920,
        viewport_height=1080,
        # "Windows NT" restored; the previous value was a corrupted UA token.
        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    )
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        word_count_threshold=10,
        page_timeout=30000,
        wait_until="networkidle",
        verbose=True,
    )
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://httpbin.org/html",
            config=run_config,
        )
        print(f"\n✅ Success: {result.success}")
        print(f"📊 Standing code: {result.status_code}")
        if not result.success:
            # No markdown to preview on failure; report the error instead.
            print(f"❌ Error: {result.error_message}")
            return result
        print("\n— Content material Preview —")
        print(result.markdown.raw_markdown[:400])
        return result
# Run the configured crawl demo, then announce the next tutorial section and
# import the strategies it uses.
consequence = asyncio.run(configured_crawl())
print("\n" + "=" * 60)
print("📝 PART 4: MARKDOWN GENERATION")
print("=" * 60)
from crawl4ai.content_filter_strategy import PruningContentFilter, BM25ContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
async def markdown_generation_demo():
    """Compare raw markdown with filtered ("fit") markdown for one page.

    Crawls the Wikipedia "Web scraping" article with a
    ``DefaultMarkdownGenerator`` that uses a ``PruningContentFilter``, then
    prints the character counts of the raw and fit markdown, the percentage
    reduction between them, and the first 600 characters of the fit markdown.

    Returns:
        The result object returned by ``AsyncWebCrawler.arun``. If the crawl
        did not succeed it is returned before any markdown is inspected.
    """
    print("\n🎯 Demonstrating markdown technology methods…")
    browser_config = BrowserConfig(headless=True, verbose=False)
    pruning_filter = PruningContentFilter(
        threshold=0.4,
        # "fixed" restored; the previous value was not a valid threshold type.
        threshold_type="fixed",
        min_word_threshold=20,
    )
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        markdown_generator=DefaultMarkdownGenerator(content_filter=pruning_filter),
    )
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://en.wikipedia.org/wiki/Web_scraping",
            config=run_config,
        )
        if not result.success:
            # Nothing to compare when the crawl failed.
            print(f"❌ Error: {result.error_message}")
            return result
        fit_markdown = result.markdown.fit_markdown
        raw_len = len(result.markdown.raw_markdown)
        fit_len = len(fit_markdown) if fit_markdown else 0
        # Guard the ratio: an empty page would otherwise divide by zero.
        reduction = (raw_len - fit_len) / raw_len * 100 if raw_len else 0.0
        print("\n📊 Markdown Comparability:")
        print(f" Uncooked Markdown: {raw_len:,} characters")
        print(f" Match Markdown: {fit_len:,} characters")
        print(f" Discount: {reduction:.1f}%")
        print("\n— Match Markdown Preview (first 600 chars) —")
        print(fit_markdown[:600] if fit_markdown else "N/A")
        return result
# Run the markdown-generation demo and keep its return value at module level.
consequence = asyncio.run(markdown_generation_demo())

