Ruby Integration
Integrate ScrapeHub into your Ruby applications
Installation
Terminal

# Install via gem
gem install scrapehub

# Or add to your Gemfile and run `bundle install`
gem 'scrapehub'
Quick Start
basic_usage.rb
require 'scrapehub'

# Initialize client
client = ScrapeHub::Client.new(api_key: 'sk_live_xxxx_449x')

# Simple scrape
result = client.scrape(
  url: 'https://example.com/products',
  engine: 'neural-x1'
)

puts "Extracted #{result.data.length} items"
result.data.each do |item|
  puts item
end
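In production code you will usually want the API key out of source control. A minimal sketch, assuming the key lives in the SCRAPEHUB_API_KEY environment variable (the same variable the Sidekiq example below uses):

env_key.rb
require 'scrapehub'

# Read the key from the environment; ENV.fetch raises if it is missing
client = ScrapeHub::Client.new(api_key: ENV.fetch('SCRAPEHUB_API_KEY'))

Advanced Configuration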
advanced.rb
require 'scrapehub'

client = ScrapeHub::Client.new(
  api_key: 'sk_live_xxxx_449x',
  timeout: 300, # 5 minutes
  retries: 3
)

# Advanced scraping with custom config
result = client.scrape(
  url: 'https://example.com/products',
  engine: 'neural-x1',
  format: 'json',

  # Pagination
  pagination: {
    enabled: true,
    max_pages: 10,
    selector: 'a.next-page'
  },

  # Custom headers
  headers: {
    'User-Agent': 'Mozilla/5.0...',
    'Accept-Language': 'en-US,en;q=0.9'
  },

  # JavaScript rendering
  render_js: true,
  wait_for_selector: '.product-list',

  # Proxy settings
  proxy: {
    enabled: true,
    region: 'us-east',
    residential: true
  }
)

puts "Status: #{result.status}"
puts "Pages scraped: #{result.pages_scraped}"
puts "Items extracted: #{result.data.length}"
puts "Time taken: #{result.duration}s"
concurrent.rb
require 'scrapehub'
require 'concurrent'

client = ScrapeHub::Client.new(api_key: 'sk_live_xxxx_449x')

urls = [
  'https://example.com/category/1',
  'https://example.com/category/2',
  'https://example.com/category/3'
]

# Scrape all URLs concurrently
promises = urls.map do |url|
  Concurrent::Promise.execute do
    client.scrape(url: url, engine: 'neural-x1')
  end
end

# Wait for all to complete
results = promises.map(&:value)

# Process results
all_items = results.flat_map(&:data)
puts "Total items extracted: #{all_items.length}"
job_management.rb
require 'scrapehub'

client = ScrapeHub::Client.new(api_key: 'sk_live_xxxx_449x')

# Start a scrape job (non-blocking)
job = client.create_job(
  url: 'https://example.com/large-dataset',
  engine: 'neural-x1'
)

puts "Job ID: #{job.id}"
puts "Status: #{job.status}"

# Poll job status
until job.complete?
  job.refresh
  puts "Progress: #{job.progress}%"
  sleep 5
end

# Get results when complete
if job.successful?
  results = job.results
  puts "Extracted #{results.length} items"
else
  puts "Job failed: #{job.error_message}"
end

# List all jobs (block variable renamed to avoid shadowing `job` above)
jobs = client.list_jobs(limit: 10, status: 'completed')
jobs.each do |j|
  puts "#{j.id}: #{j.status} - #{j.created_at}"
end
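The polling loop above blocks for as long as the job runs; if a job can stall, guard it with a deadline. A minimal sketch reusing the documented complete?/refresh methods (the 10-minute limit is arbitrary):

poll_with_timeout.rb
deadline = Time.now + 600 # give up after 10 minutes

until job.complete?
  raise 'Timed out waiting for scrape job' if Time.now > deadline
  job.refresh
  sleep 5
end

Rails Integration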
app/services/scraper_service.rb
class ScraperService
  def initialize
    @client = ScrapeHub::Client.new(
      api_key: Rails.application.credentials.scrapehub_api_key
    )
  end

  def scrape_products(url)
    result = @client.scrape(
      url: url,
      engine: 'neural-x1',
      format: 'json'
    )

    # Save to database
    result.data.each do |product_data|
      Product.create!(
        name: product_data['name'],
        price: product_data['price'],
        url: product_data['url']
      )
    end

    result.data
  rescue ScrapeHub::Error => e
    Rails.logger.error "Scraping failed: #{e.message}"
    raise
  end
end
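Because create! raises on the first invalid row, the import loop above can leave a batch half-saved. If you want all-or-nothing behavior, a standard ActiveRecord transaction is the usual refinement:

app/services/scraper_service.rb (variant)
# Roll back the whole import if any row fails validation
ActiveRecord::Base.transaction do
  result.data.each do |product_data|
    Product.create!(
      name: product_data['name'],
      price: product_data['price'],
      url: product_data['url']
    )
  end
end

Background Jobs with Sidekiq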
app/jobs/scrape_job.rb
class ScrapeJob
  include Sidekiq::Job

  def perform(url, user_id)
    client = ScrapeHub::Client.new(
      api_key: ENV['SCRAPEHUB_API_KEY']
    )

    # Create async job; scrape_webhook_url requires default_url_options to be configured
    job = client.create_job(
      url: url,
      engine: 'neural-x1',
      webhook_url: Rails.application.routes.url_helpers.scrape_webhook_url
    )

    # Store job ID for tracking
    ScrapeRecord.create!(
      user_id: user_id,
      job_id: job.id,
      url: url,
      status: 'pending'
    )
  end
end

# Controller
class ScraperController < ApplicationController
  # Webhook requests come from ScrapeHub, not a browser form, so skip CSRF verification
  skip_before_action :verify_authenticity_token, only: :webhook

  def create
    ScrapeJob.perform_async(params[:url], current_user.id)
    render json: { message: 'Scrape job started' }
  end

  def webhook
    job_id = params[:job_id]
    status = params[:status]

    # find_by returns nil for unknown job IDs, so update safely
    record = ScrapeRecord.find_by(job_id: job_id)
    record&.update!(status: status)

    if status == 'completed'
      # Process results
      client = ScrapeHub::Client.new(api_key: ENV['SCRAPEHUB_API_KEY'])
      job = client.get_job(job_id)
      # ... handle results
    end

    head :ok
  end
end
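The controller above assumes two routes exist, including the named scrape_webhook route behind the scrape_webhook_url helper used in ScrapeJob. A minimal sketch of the corresponding config/routes.rb (paths are illustrative):

config/routes.rb
Rails.application.routes.draw do
  post '/scrapes',        to: 'scraper#create'
  post '/scrape_webhook', to: 'scraper#webhook', as: :scrape_webhook
end

Data Export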
export_data.rb
require 'scrapehub'
require 'csv'
require 'json'

client = ScrapeHub::Client.new(api_key: 'sk_live_xxxx_449x')

# Get job results
job = client.get_job('job_abc123')
data = job.results

# Export to JSON
File.open('results.json', 'w') do |f|
  f.write(JSON.pretty_generate(data))
end

# Export to CSV (guard against an empty result set)
if data.any?
  CSV.open('results.csv', 'w') do |csv|
    # Write headers from the first row's keys
    csv << data.first.keys

    # Write rows
    data.each do |row|
      csv << row.values
    end
  end
end

# Or use built-in export
job.export_to_file('results.json', format: 'json')
job.export_to_file('results.csv', format: 'csv')

Error Handling
error_handling.rb
require 'scrapehub'

client = ScrapeHub::Client.new(api_key: 'sk_live_xxxx_449x')

begin
  result = client.scrape(url: 'https://example.com')
rescue ScrapeHub::AuthenticationError => e
  puts "Authentication failed: #{e.message}"
  puts "Check your API key"
rescue ScrapeHub::RateLimitError => e
  puts "Rate limit exceeded: #{e.message}"
  puts "Retry after: #{e.retry_after} seconds"
  sleep e.retry_after
  retry
rescue ScrapeHub::InvalidRequestError => e
  puts "Invalid request: #{e.message}"
  puts "Error details: #{e.details}"
rescue ScrapeHub::ScraperError => e
  puts "Scraper failed: #{e.message}"
  puts "Target URL: #{e.url}"
  puts "Error code: #{e.code}"
rescue ScrapeHub::Error => e
  puts "General error: #{e.message}"
rescue StandardError => e
  puts "Unexpected error: #{e.message}"
end
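For anything beyond a demo, wrap the retry in a bounded helper so a persistent rate limit cannot loop forever. A sketch using only the documented retry_after accessor (with_backoff is a hypothetical helper, not part of the gem):

retry_helper.rb
# Retry rate-limited calls with a capped number of attempts
def with_backoff(max_attempts: 3)
  attempts = 0
  begin
    yield
  rescue ScrapeHub::RateLimitError => e
    attempts += 1
    raise if attempts >= max_attempts
    # Fall back to exponential backoff if the server gave no hint
    sleep(e.retry_after || 2**attempts)
    retry
  end
end

result = with_backoff { client.scrape(url: 'https://example.com') }

Testing with RSpec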
spec/services/scraper_service_spec.rb
require 'rails_helper'

RSpec.describe ScraperService do
  let(:service) { described_class.new }
  let(:mock_client) { instance_double(ScrapeHub::Client) }

  before do
    allow(ScrapeHub::Client).to receive(:new).and_return(mock_client)
  end

  describe '#scrape_products' do
    let(:url) { 'https://example.com/products' }
    let(:mock_result) do
      double(
        data: [
          { 'name' => 'Product 1', 'price' => 29.99, 'url' => 'https://...' },
          { 'name' => 'Product 2', 'price' => 39.99, 'url' => 'https://...' }
        ]
      )
    end

    it 'scrapes products and saves to database' do
      expect(mock_client).to receive(:scrape).with(
        url: url,
        engine: 'neural-x1',
        format: 'json'
      ).and_return(mock_result)

      expect {
        service.scrape_products(url)
      }.to change(Product, :count).by(2)
    end

    it 'handles errors gracefully' do
      expect(mock_client).to receive(:scrape).and_raise(
        ScrapeHub::ScraperError.new('Failed to scrape')
      )

      expect {
        service.scrape_products(url)
      }.to raise_error(ScrapeHub::ScraperError)
    end
  end
end
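If you prefer stubbing at the HTTP layer instead of doubling the client, WebMock works as usual. The host and path below are hypothetical; point them at whatever endpoint your client actually calls:

spec/support/webmock_example.rb
require 'webmock/rspec'

# Inside an example: fake a successful scrape response
stub_request(:post, 'https://api.scrapehub.com/v1/scrape')
  .to_return(
    status: 200,
    body: { data: [{ 'name' => 'Product 1' }] }.to_json,
    headers: { 'Content-Type' => 'application/json' }
  )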