Ruby Integration

Integrate ScrapeHub into your Ruby applications

Installation

Terminal
# Install via gem
gem install scrapehub

# Or add to Gemfile
gem 'scrapehub'

Quick Start

basic_usage.rb
ruby
require 'scrapehub'

# Initialize client
client = ScrapeHub::Client.new(api_key: 'sk_live_xxxx_449x')

# Simple scrape
result = client.scrape(
  url: 'https://example.com/products',
  engine: 'neural-x1'
)

puts "Extracted #{result.data.length} items"
result.data.each do |item|
  puts item
end
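
In production you'll typically read the API key from the environment rather than hard-coding it. A minimal sketch (the SCRAPEHUB_API_KEY variable name is an assumption here, matching the Sidekiq example later in this guide):

ruby
# Read the API key from the environment instead of embedding it in source
client = ScrapeHub::Client.new(api_key: ENV.fetch('SCRAPEHUB_API_KEY'))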

Advanced Configuration

advanced.rb
ruby
require 'scrapehub'

client = ScrapeHub::Client.new(
  api_key: 'sk_live_xxxx_449x',
  timeout: 300, # 5 minutes
  retries: 3
)

# Advanced scraping with custom config
result = client.scrape(
  url: 'https://example.com/products',
  engine: 'neural-x1',
  format: 'json',

  # Pagination
  pagination: {
    enabled: true,
    max_pages: 10,
    selector: 'a.next-page'
  },

  # Custom headers
  headers: {
    'User-Agent': 'Mozilla/5.0...',
    'Accept-Language': 'en-US,en;q=0.9'
  },

  # JavaScript rendering
  render_js: true,
  wait_for_selector: '.product-list',

  # Proxy settings
  proxy: {
    enabled: true,
    region: 'us-east',
    residential: true
  }
)

puts "Status: #{result.status}"
puts "Pages scraped: #{result.pages_scraped}"
puts "Items extracted: #{result.data.length}"
puts "Time taken: #{result.duration}s"

Async/Concurrent Processing

concurrent.rb
ruby
require 'scrapehub'
require 'concurrent'

client = ScrapeHub::Client.new(api_key: 'sk_live_xxxx_449x')

urls = [
  'https://example.com/category/1',
  'https://example.com/category/2',
  'https://example.com/category/3'
]

# Scrape all URLs concurrently
promises = urls.map do |url|
  Concurrent::Promise.execute do
    client.scrape(url: url, engine: 'neural-x1')
  end
end

# Wait for all to complete
results = promises.map(&:value)

# Process results
all_items = results.flat_map(&:data)
puts "Total items extracted: #{all_items.length}"

Job Management

job_management.rb
ruby
require 'scrapehub'

client = ScrapeHub::Client.new(api_key: 'sk_live_xxxx_449x')

# Start a scrape job (non-blocking)
job = client.create_job(
  url: 'https://example.com/large-dataset',
  engine: 'neural-x1'
)

puts "Job ID: #{job.id}"
puts "Status: #{job.status}"

# Poll job status
until job.complete?
  job.refresh
  puts "Progress: #{job.progress}%"
  sleep 5
end

# Get results when complete
if job.successful?
  results = job.results
  puts "Extracted #{results.length} items"
else
  puts "Job failed: #{job.error_message}"
end

# List all jobs
jobs = client.list_jobs(limit: 10, status: 'completed')
jobs.each do |j|
  puts "#{j.id}: #{j.status} - #{j.created_at}"
end
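
An unbounded until loop will spin forever if a job stalls, so you may want a deadline on the polling. A sketch using the same job methods shown above (the 10-minute limit is arbitrary):

ruby
# Poll with a deadline so a stalled job doesn't block the process indefinitely
deadline = Time.now + 600 # 10 minutes

until job.complete?
  raise "Timed out waiting for job #{job.id}" if Time.now > deadline
  job.refresh
  sleep 5
end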

Rails Integration

app/services/scraper_service.rb
ruby
class ScraperService
  def initialize
    @client = ScrapeHub::Client.new(
      api_key: Rails.application.credentials.scrapehub_api_key
    )
  end

  def scrape_products(url)
    result = @client.scrape(
      url: url,
      engine: 'neural-x1',
      format: 'json'
    )

    # Save to database
    result.data.each do |product_data|
      Product.create!(
        name: product_data['name'],
        price: product_data['price'],
        url: product_data['url']
      )
    end

    result.data
  rescue ScrapeHub::Error => e
    Rails.logger.error "Scraping failed: #{e.message}"
    raise
  end
end
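
A possible call site for the service (the controller name, action, and params are hypothetical):

app/controllers/product_imports_controller.rb
ruby
class ProductImportsController < ApplicationController
  # Kicks off a synchronous scrape and reports how many products were imported
  def create
    products = ScraperService.new.scrape_products(params[:url])
    render json: { imported: products.length }
  end
end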

Background Jobs with Sidekiq

app/jobs/scrape_job.rb
ruby
class ScrapeJob
  include Sidekiq::Job

  def perform(url, user_id)
    client = ScrapeHub::Client.new(
      api_key: ENV['SCRAPEHUB_API_KEY']
    )

    # Create async job
    job = client.create_job(
      url: url,
      engine: 'neural-x1',
      webhook_url: Rails.application.routes.url_helpers.scrape_webhook_url
    )

    # Store job ID for tracking
    ScrapeRecord.create!(
      user_id: user_id,
      job_id: job.id,
      url: url,
      status: 'pending'
    )
  end
end

# Controller
class ScraperController < ApplicationController
  def create
    ScrapeJob.perform_async(params[:url], current_user.id)
    render json: { message: 'Scrape job started' }
  end

  def webhook
    job_id = params[:job_id]
    status = params[:status]

    record = ScrapeRecord.find_by(job_id: job_id)
    record.update!(status: status)

    if status == 'completed'
      # Process results
      client = ScrapeHub::Client.new(api_key: ENV['SCRAPEHUB_API_KEY'])
      job = client.get_job(job_id)
      # ... handle results
    end

    head :ok
  end
end
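
The controller's webhook action needs a route, and the Sidekiq job references a scrape_webhook_url helper. One way to wire that up (the paths are assumptions; only the route name has to match the helper):

config/routes.rb
ruby
Rails.application.routes.draw do
  # Endpoint that enqueues ScrapeJob
  post '/scrapes', to: 'scraper#create'

  # Callback endpoint hit when a job finishes; the :as option generates
  # the scrape_webhook_url helper used when creating the job
  post '/scrape_webhook', to: 'scraper#webhook', as: :scrape_webhook
end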

Data Export

export_data.rb
ruby
require 'scrapehub'
require 'csv'
require 'json'

client = ScrapeHub::Client.new(api_key: 'sk_live_xxxx_449x')

# Get job results
job = client.get_job('job_abc123')
data = job.results

# Export to JSON
File.open('results.json', 'w') do |f|
  f.write(JSON.pretty_generate(data))
end

# Export to CSV
CSV.open('results.csv', 'w') do |csv|
  # Write headers
  csv << data.first.keys

  # Write rows
  data.each do |row|
    csv << row.values
  end
end

# Or use built-in export
job.export_to_file('results.json', format: 'json')
job.export_to_file('results.csv', format: 'csv')
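
Scraped records don't always share an identical set of keys. If that can happen with your data, build the CSV header from the union of keys rather than the first row (a sketch in plain Ruby):

ruby
# Derive headers from every key that appears anywhere in the data
headers = data.flat_map(&:keys).uniq

CSV.open('results_all_keys.csv', 'w') do |csv|
  csv << headers
  data.each { |row| csv << headers.map { |key| row[key] } }
end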

Error Handling

error_handling.rb
ruby
require 'scrapehub'

client = ScrapeHub::Client.new(api_key: 'sk_live_xxxx_449x')

begin
  result = client.scrape(url: 'https://example.com')

rescue ScrapeHub::AuthenticationError => e
  puts "Authentication failed: #{e.message}"
  puts "Check your API key"

rescue ScrapeHub::RateLimitError => e
  puts "Rate limit exceeded: #{e.message}"
  puts "Retry after: #{e.retry_after} seconds"
  sleep e.retry_after
  retry

rescue ScrapeHub::InvalidRequestError => e
  puts "Invalid request: #{e.message}"
  puts "Error details: #{e.details}"

rescue ScrapeHub::ScraperError => e
  puts "Scraper failed: #{e.message}"
  puts "Target URL: #{e.url}"
  puts "Error code: #{e.code}"

rescue ScrapeHub::Error => e
  puts "General error: #{e.message}"

rescue StandardError => e
  puts "Unexpected error: #{e.message}"
end
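
For transient scraper failures you may prefer a bounded retry with backoff over a bare retry. A sketch (the attempt count and backoff schedule are arbitrary):

ruby
# Retry transient scraper errors a few times with exponential backoff
def scrape_with_retries(client, url, attempts: 3)
  tries = 0
  begin
    client.scrape(url: url, engine: 'neural-x1')
  rescue ScrapeHub::ScraperError
    tries += 1
    raise if tries >= attempts
    sleep 2**tries # 2s, 4s, ...
    retry
  end
end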

Testing with RSpec

spec/services/scraper_service_spec.rb
ruby
require 'rails_helper'

RSpec.describe ScraperService do
  let(:service) { described_class.new }
  let(:mock_client) { instance_double(ScrapeHub::Client) }

  before do
    allow(ScrapeHub::Client).to receive(:new).and_return(mock_client)
  end

  describe '#scrape_products' do
    let(:url) { 'https://example.com/products' }
    let(:mock_result) do
      double(
        data: [
          { 'name' => 'Product 1', 'price' => 29.99, 'url' => 'https://...' },
          { 'name' => 'Product 2', 'price' => 39.99, 'url' => 'https://...' }
        ]
      )
    end

    it 'scrapes products and saves to database' do
      expect(mock_client).to receive(:scrape).with(
        url: url,
        engine: 'neural-x1',
        format: 'json'
      ).and_return(mock_result)

      expect {
        service.scrape_products(url)
      }.to change(Product, :count).by(2)
    end

    it 'handles errors gracefully' do
      expect(mock_client).to receive(:scrape).and_raise(
        ScrapeHub::ScraperError.new('Failed to scrape')
      )

      expect {
        service.scrape_products(url)
      }.to raise_error(ScrapeHub::ScraperError)
    end
  end
end

API Reference

For complete API documentation, visit our Ruby gem reference:

View Ruby Gem Docs →