Python Integration
Complete guide for integrating ScrapeHub with Python applications
Installation
Terminal
# Install via pip
pip install scrapehub-python

# Or with additional dependencies
pip install "scrapehub-python[async]"
Quick Start
basic_usage.py
from scrapehub import ScrapeHubClient
# Initialize client
client = ScrapeHubClient(api_key="sk_live_xxxx_449x")
# Simple scrape
result = client.scrape(
url="https://example.com/products",
engine="neural-x1"
)
print(f"Extracted {len(result.data)} items")
for item in result.data:
    print(item)
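The examples on this page hard-code an API key for brevity. In a real project it is safer to load the key from the environment so it never lands in version control. A minimal sketch using only the standard library and the constructor shown above; the SCRAPEHUB_API_KEY variable name is our own convention, not something the SDK requires:

import os

from scrapehub import ScrapeHubClient

# SCRAPEHUB_API_KEY is a name we chose; export it in your shell or CI secrets
api_key = os.environ.get("SCRAPEHUB_API_KEY")
if not api_key:
    raise RuntimeError("Set SCRAPEHUB_API_KEY before running the scraper")

client = ScrapeHubClient(api_key=api_key)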
Advanced Configuration
advanced.py
from scrapehub import ScrapeHubClient, ScrapeConfig
client = ScrapeHubClient(
api_key="sk_live_xxxx_449x",
timeout=300, # 5 minutes
retries=3
)
# Advanced scraping with custom config
config = ScrapeConfig(
url="https://example.com/products",
engine="neural-x1",
format="json",
# Pagination
pagination={
"enabled": True,
"max_pages": 10,
"selector": "a.next-page"
},
# Custom headers
headers={
"User-Agent": "Mozilla/5.0...",
"Accept-Language": "en-US,en;q=0.9"
},
# JavaScript rendering
render_js=True,
wait_for_selector=".product-list",
# Proxy settings
proxy={
"enabled": True,
"region": "us-east",
"residential": True
},
# Rate limiting
rate_limit={
"requests_per_second": 2,
"delay_between_pages": 1000 # ms
}
)
result = client.scrape_with_config(config)
# Check status
print(f"Status: {result.status}")
print(f"Pages scraped: {result.pages_scraped}")
print(f"Items extracted: {len(result.data)}")
print(f"Time taken: {result.duration}s")Async/Await Support
Async/Await Support
async_scraping.py
import asyncio
from scrapehub import AsyncScrapeHubClient
async def scrape_multiple_urls():
    client = AsyncScrapeHubClient(api_key="sk_live_xxxx_449x")

    urls = [
        "https://example.com/category/1",
        "https://example.com/category/2",
        "https://example.com/category/3"
    ]

    # Scrape all URLs concurrently
    tasks = [client.scrape(url) for url in urls]
    results = await asyncio.gather(*tasks)

    # Process results
    all_items = []
    for result in results:
        all_items.extend(result.data)

    print(f"Total items extracted: {len(all_items)}")
    return all_items
# Run async function
asyncio.run(scrape_multiple_urls())
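asyncio.gather() starts every request at once, which can collide with your plan's rate limits on long URL lists. The sketch below caps concurrency with asyncio.Semaphore; it assumes only the AsyncScrapeHubClient.scrape() call shown above:

import asyncio

from scrapehub import AsyncScrapeHubClient

async def scrape_bounded(urls, max_concurrency=3):
    client = AsyncScrapeHubClient(api_key="sk_live_xxxx_449x")
    semaphore = asyncio.Semaphore(max_concurrency)

    async def scrape_one(url):
        # Only max_concurrency requests are in flight at any time
        async with semaphore:
            return await client.scrape(url)

    results = await asyncio.gather(*(scrape_one(u) for u in urls))
    return [item for result in results for item in result.data]

items = asyncio.run(scrape_bounded([
    "https://example.com/category/1",
    "https://example.com/category/2",
]))
print(f"Total items extracted: {len(items)}")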
Job Management
job_management.py
from scrapehub import ScrapeHubClient
import time
client = ScrapeHubClient(api_key="sk_live_xxxx_449x")
# Start a scrape job (non-blocking)
job = client.create_job(
url="https://example.com/large-dataset",
engine="neural-x1"
)
print(f"Job ID: {job.id}")
print(f"Status: {job.status}")
# Poll job status
while not job.is_complete():
    job.refresh()
    print(f"Progress: {job.progress}%")
    time.sleep(5)
# Get results when complete
if job.is_successful():
    results = job.get_results()
    print(f"Extracted {len(results)} items")
else:
    print(f"Job failed: {job.error_message}")
# List all jobs
jobs = client.list_jobs(limit=10, status="completed")
for job in jobs:
print(f"{job.id}: {job.status} - {job.created_at}")Data Export
Data Export
export_data.py
from scrapehub import ScrapeHubClient
import pandas as pd
import json
client = ScrapeHubClient(api_key="sk_live_xxxx_449x")
# Get job results
job = client.get_job("job_abc123")
data = job.get_results()
# Export to JSON
with open('results.json', 'w') as f:
    json.dump(data, f, indent=2)
# Export to CSV using pandas
df = pd.DataFrame(data)
df.to_csv('results.csv', index=False)
# Export to Excel
df.to_excel('results.xlsx', index=False)
# Export to Parquet (compressed)
df.to_parquet('results.parquet', compression='snappy')
# Or use built-in export
job.export_to_file('results.json', format='json')
job.export_to_file('results.csv', format='csv')
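For very large result sets, JSON Lines (one record per line) is often easier to stream and append to than a single JSON array. A minimal sketch using only the standard library, assuming get_results() returns a list of dicts as in the examples above:

import json

data = job.get_results()

# One JSON object per line; downstream tools can read the file incrementally
with open('results.jsonl', 'w') as f:
    for record in data:
        f.write(json.dumps(record) + '\n')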
Error Handling
error_handling.py
from scrapehub import ScrapeHubClient, ScrapeHubError
from scrapehub.exceptions import (
    AuthenticationError,
    RateLimitError,
    InvalidRequestError,
    ScraperError
)
client = ScrapeHubClient(api_key="sk_live_xxxx_449x")
try:
result = client.scrape("https://example.com")
except AuthenticationError as e:
print(f"Authentication failed: {e}")
print("Check your API key")
except RateLimitError as e:
print(f"Rate limit exceeded: {e}")
print(f"Retry after: {e.retry_after} seconds")
except InvalidRequestError as e:
print(f"Invalid request: {e}")
print(f"Error details: {e.details}")
except ScraperError as e:
print(f"Scraper failed: {e}")
print(f"Target URL: {e.url}")
print(f"Error code: {e.code}")
except ScrapeHubError as e:
print(f"General error: {e}")
except Exception as e:
print(f"Unexpected error: {e}")Webhooks
Webhooks
webhooks.py
from scrapehub import ScrapeHubClient
from flask import Flask, request, jsonify
client = ScrapeHubClient(api_key="sk_live_xxxx_449x")
app = Flask(__name__)
# Create a job with webhook
job = client.create_job(
url="https://example.com/products",
engine="neural-x1",
webhook_url="https://your-server.com/webhook",
webhook_events=["job.completed", "job.failed"]
)
# Webhook endpoint
@app.route('/webhook', methods=['POST'])
def handle_webhook():
    data = request.json
    event_type = data['event']
    job_id = data['job_id']

    if event_type == 'job.completed':
        print(f"Job {job_id} completed!")
        # Process results
        job = client.get_job(job_id)
        results = job.get_results()
        # ... do something with results
    elif event_type == 'job.failed':
        print(f"Job {job_id} failed!")
        # Handle failure

    return jsonify({"status": "received"}), 200
if __name__ == '__main__':
    app.run(port=5000)
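Webhook senders generally expect a quick 2xx response; fetching and processing results inside the request handler can lead to timeouts and redelivery. One option is to acknowledge immediately and do the work on a background thread. A sketch using only Flask, the standard library, and the client calls shown above:

from concurrent.futures import ThreadPoolExecutor

from flask import Flask, request, jsonify
from scrapehub import ScrapeHubClient

client = ScrapeHubClient(api_key="sk_live_xxxx_449x")
app = Flask(__name__)
executor = ThreadPoolExecutor(max_workers=2)

def process_completed_job(job_id):
    # Runs outside the request cycle
    job = client.get_job(job_id)
    results = job.get_results()
    print(f"Job {job_id}: processed {len(results)} items")

@app.route('/webhook', methods=['POST'])
def handle_webhook():
    data = request.json
    if data['event'] == 'job.completed':
        executor.submit(process_completed_job, data['job_id'])
    # Acknowledge right away; heavy work happens in the background
    return jsonify({"status": "received"}), 200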
Integration with Pandas
pandas_integration.py
from scrapehub import ScrapeHubClient
import pandas as pd
client = ScrapeHubClient(api_key="sk_live_xxxx_449x")
# Scrape and convert to DataFrame
result = client.scrape("https://example.com/products")
df = pd.DataFrame(result.data)
# Data cleaning
df = df.drop_duplicates(subset=['product_id'])
df['price'] = df['price'].str.replace('$', '', regex=False).astype(float)
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')
# Filter and sort
top_products = df[df['rating'] > 4.5].sort_values('price', ascending=False)
# Aggregate data
category_stats = df.groupby('category').agg({
    'price': ['mean', 'min', 'max'],
    'rating': 'mean',
    'product_id': 'count'
})
print(category_stats)
# Export
df.to_csv('cleaned_products.csv', index=False)
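If the extracted records contain nested dictionaries, pandas.json_normalize can flatten them into dotted columns before the cleaning steps above. A short sketch assuming result.data is a list of dicts; the nested seller field mentioned in the comment is purely illustrative:

import pandas as pd

# Flatten nested dicts such as {"seller": {"name": ..., "rating": ...}}
# into seller.name / seller.rating columns
df = pd.json_normalize(result.data, sep='.')
print(df.columns.tolist())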
Examples Repository
Check out our GitHub repository for more Python examples and use cases:
Browse Examples →