Custom Rules

Define precise extraction rules and custom scraping logic

About Custom Rules

Custom Rules allow you to define precise extraction logic using CSS selectors, XPath expressions, regular expressions, and custom JavaScript. They are ideal when you need full control over the scraping process or when working with complex page structures.

Basic Extraction Rules

CSS Selectors

css_selectors.py
python
from scrapehub import ScrapeHubClient

client = ScrapeHubClient(api_key="your_api_key")

result = client.scrape(
    url="https://example.com",
    rules={
        "title": {
            "selector": "h1.product-title",
            "type": "text"
        },
        "price": {
            "selector": ".price-value",
            "type": "text",
            "transform": "number"  # Convert to number
        },
        "image": {
            "selector": "img.product-image",
            "attribute": "src"  # Extract attribute instead of text
        }
    }
)

print(result.data)
# {
#   "title": "Product Name",
#   "price": 99.99,
#   "image": "https://example.com/image.jpg"
# }

XPath Expressions

xpath_rules.py
python
result = client.scrape(
    url="https://example.com",
    rules={
        "title": {
            "xpath": "//h1[@class='product-title']/text()",
            "type": "text"
        },
        "description": {
            "xpath": "//div[@class='description']//p/text()",
            "type": "text",
            "join": " "  # Join multiple matches with space
        },
        "specs": {
            "xpath": "//table[@class='specs']//tr",
            "type": "list",
            "item": {
                "name": "./td[1]/text()",
                "value": "./td[2]/text()"
            }
        }
    }
)

print(result.data)

Node.js Example

custom_rules.js
javascript
const { ScrapeHubClient } = require('@scrapehub/node');

const client = new ScrapeHubClient({
  apiKey: process.env.SCRAPEHUB_API_KEY
});

async function scrapeWithRules() {
  const result = await client.scrape({
    url: 'https://example.com',
    rules: {
      title: {
        selector: 'h1.product-title',
        type: 'text'
      },
      price: {
        selector: '.price-value',
        type: 'text',
        transform: 'number'
      }
    }
  });

  console.log(result.data);
}

scrapeWithRules();

List Extraction

Extract Multiple Items

list_extraction.py
python
result = client.scrape(
    url="https://example.com/products",
    rules={
        "products": {
            "selector": ".product-card",
            "type": "list",
            "item": {
                "name": {
                    "selector": ".product-name",
                    "type": "text"
                },
                "price": {
                    "selector": ".price",
                    "type": "text",
                    "transform": "number"
                },
                "url": {
                    "selector": "a.product-link",
                    "attribute": "href"
                },
                "rating": {
                    "selector": ".rating",
                    "attribute": "data-rating",
                    "transform": "number"
                }
            }
        }
    }
)

# Access the list
for product in result.data['products']:
    print(f"{product['name']}: ${product['price']}")

Nested Lists

nested_lists.py
python
result = client.scrape(
    url="https://example.com/categories",
    rules={
        "categories": {
            "selector": ".category",
            "type": "list",
            "item": {
                "name": {
                    "selector": ".category-name",
                    "type": "text"
                },
                "products": {
                    "selector": ".product",
                    "type": "list",
                    "item": {
                        "name": ".product-name",
                        "price": ".product-price"
                    }
                }
            }
        }
    }
)

print(result.data)
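# Illustrative output shape (field values depend on the page):
# {
#   "categories": [
#     {
#       "name": "Electronics",
#       "products": [
#         {"name": "Laptop", "price": "$999.00"}
#       ]
#     }
#   ]
# }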

Data Transformations

Built-in Transformers

transformers.py
python
result = client.scrape(
    url="https://example.com",
    rules={
        "price": {
            "selector": ".price",
            "transform": "number"  # Extract numeric value
        },
        "date": {
            "selector": ".published-date",
            "transform": "date"  # Parse and normalize date
        },
        "email": {
            "selector": ".contact",
            "transform": "email"  # Extract email from text
        },
        "phone": {
            "selector": ".phone",
            "transform": "phone"  # Extract and format phone
        },
        "url": {
            "selector": "a",
            "attribute": "href",
            "transform": "url"  # Convert to absolute URL
        },
        "text": {
            "selector": ".description",
            "transform": "trim"  # Trim whitespace
        }
    }
)

Regular Expressions

regex_transform.py
python
result = client.scrape(
    url="https://example.com",
    rules={
        "sku": {
            "selector": ".product-info",
            "type": "text",
            "regex": r"SKU:\s*([A-Z0-9-]+)",  # Extract SKU
            "regex_group": 1  # Use first capture group
        },
        "dimensions": {
            "selector": ".specs",
            "type": "text",
            "regex": r"(\d+)\s*x\s*(\d+)\s*x\s*(\d+)",
            "regex_group": "all"  # Get all groups as list
        },
        "price_numbers": {
            "selector": ".pricing",
            "type": "text",
            "regex": r"\d+\.\d+",  # Find all price numbers
            "regex_all": True  # Get all matches
        }
    }
)

print(result.data)
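# Illustrative output shape (values depend on the page text):
# {
#   "sku": "AB-1234",                      # first capture group as a string
#   "dimensions": ["10", "20", "30"],      # all capture groups as a list
#   "price_numbers": ["19.99", "24.99"]    # all matches as a list
# }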

Custom JavaScript Transform

custom_transform.py
python
result = client.scrape(
    url="https://example.com",
    rules={
        "complex_data": {
            "selector": ".data-container",
            "type": "text",
            "custom_transform": """
                function(value) {
                    // Custom transformation logic
                    const parts = value.split('|');
                    return {
                        name: parts[0].trim(),
                        quantity: parseInt(parts[1]),
                        available: parts[2] === 'yes'
                    };
                }
            """
        }
    }
)

print(result.data)
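# Illustrative: for source text "Gadget|5|yes" the transform above returns
# {"complex_data": {"name": "Gadget", "quantity": 5, "available": True}}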

Conditional Extraction

Conditional Rules

conditional_rules.py
python
result = client.scrape(
    url="https://example.com",
    rules={
        "product_type": {
            "selector": ".product-type",
            "type": "text"
        },
        "digital_link": {
            "selector": ".download-link",
            "attribute": "href",
            "condition": {
                "field": "product_type",
                "equals": "digital"  # Only extract if digital product
            }
        },
        "shipping_weight": {
            "selector": ".weight",
            "type": "text",
            "condition": {
                "field": "product_type",
                "equals": "physical"
            }
        }
    }
)

print(result.data)

Pagination Rules

pagination_rules.py
python
result = client.scrape(
    url="https://example.com/products",
    rules={
        "products": {
            "selector": ".product",
            "type": "list",
            "item": {
                "name": ".product-name",
                "price": ".product-price"
            }
        }
    },
    pagination={
        "next_selector": "a.next-page",  # Next button selector
        "max_pages": 10,
        "wait_time": 2,  # Seconds between pages
        "stop_condition": {
            "selector": ".no-more-products",  # Stop if this appears
            "exists": True
        }
    }
)

# All pages are scraped automatically
print(f"Total products: {len(result.data['products'])}")

Advanced Features

Dynamic Content Handling

dynamic_content.py
python
result = client.scrape(
    url="https://example.com",
    rules={
        "lazy_loaded_images": {
            "selector": "img[data-src]",
            "attribute": "data-src",
            "type": "list",
            "wait_for": {
                "selector": "img[src]",  # Wait until images load
                "timeout": 10
            }
        }
    },
    javascript={
        "enabled": True,
        "scroll": True,  # Scroll to trigger lazy loading
        "wait": 2000  # Wait for content to load
    }
)

Custom JavaScript Execution

custom_javascript.py
python
result = client.scrape(
    url="https://example.com",
    javascript={
        "enabled": True,
        "before_scrape": """
            // Execute before extraction
            document.querySelector('.load-more').click();
            await new Promise(r => setTimeout(r, 2000));
        """,
        "custom_extraction": """
            // Custom extraction logic
            return {
                custom_data: window.myAppData,
                computed_value: calculateSomething()
            };
        """
    },
    rules={
        "title": ".title",
        "price": ".price"
    }
)

Error Handling in Rules

rules_error_handling.py
python
result = client.scrape(
    url="https://example.com",
    rules={
        "price": {
            "selector": ".price",
            "type": "text",
            "required": True,  # Fail if not found
            "default": None  # Use default if not found (overrides required)
        },
        "optional_field": {
            "selector": ".optional",
            "required": False,
            "default": "N/A"
        },
        "fallback_example": {
            "selectors": [  # Try multiple selectors
                ".primary-selector",
                ".backup-selector",
                ".last-resort-selector"
            ],
            "type": "text"
        }
    }
)
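
When a field marked required cannot be found and no default is supplied, the scrape fails rather than returning partial data. Below is a minimal sketch of handling that failure in client code, reusing the client from the earlier examples; the broad except clause is a placeholder, since this page does not document the SDK's specific exception types:

handle_required_failure.py
python
try:
    result = client.scrape(
        url="https://example.com",
        rules={
            "price": {
                "selector": ".price",
                "type": "text",
                "required": True  # Fail the scrape if not found
            }
        }
    )
    print(result.data)
except Exception as exc:  # Narrow to the SDK's error type if one is documented
    # Log and fall back instead of crashing a larger pipeline
    print(f"Extraction failed: {exc}")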

Rule Templates

Save and reuse common rule configurations:

rule_templates.py
python
# Save a rule template
template = client.rules.create_template(
    name="product-extraction",
    rules={
        "name": {
            "selector": ".product-name",
            "type": "text"
        },
        "price": {
            "selector": ".price",
            "transform": "number"
        },
        "rating": {
            "selector": ".rating",
            "attribute": "data-rating",
            "transform": "number"
        }
    }
)

# Use the template
result = client.scrape(
    url="https://example.com",
    template="product-extraction"  # or template=template.id
)

# List all templates
templates = client.rules.list_templates()
for t in templates:
    print(f"{t.name}: {t.id}")

Best Practices

Custom Rules Tips

  • Use specific selectors to avoid extracting the wrong data
  • Prefer CSS selectors over XPath for better performance
  • Always validate and transform data to expected formats
  • Use fallback selectors for resilient extraction
  • Test rules on multiple pages before production use (see the sketch after this list)
  • Create reusable templates for common patterns
  • Combine with Neural Engine for more robust extraction
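
As a concrete version of the multi-page testing tip above, here is a minimal sketch that runs one rule set against several sample pages and flags fields that come back empty. The sample URLs and the emptiness check are illustrative; adapt them to the target site:

test_rules_multiple_pages.py
python
rules = {
    "name": {"selector": ".product-name", "type": "text"},
    "price": {"selector": ".price", "transform": "number"}
}

# Illustrative sample URLs; use real pages from the target site
sample_urls = [
    "https://example.com/products/1",
    "https://example.com/products/2",
    "https://example.com/products/3"
]

for url in sample_urls:
    result = client.scrape(url=url, rules=rules)
    # Surface fragile selectors early by flagging fields that came back empty
    missing = [field for field, value in result.data.items() if value in (None, "", [])]
    print(f"{url}: missing fields -> {missing or 'none'}")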

Important Notes

  • Custom rules require manual updates when site structure changes
  • Complex JavaScript transforms may impact performance
  • Always handle missing or optional fields gracefully
  • Test extraction rules thoroughly before scaling up