Custom Rules

Define precise extraction rules and custom scraping logic

About Custom Rules

Custom Rules allow you to define precise extraction logic using CSS selectors, XPath expressions, regular expressions, and custom JavaScript. They are ideal when you need full control over the scraping process or when working with complex page structures.

Basic Extraction Rules

CSS Selectors

css_selectors.py
python
from scrapehub import ScrapeHubClient

client = ScrapeHubClient(api_key="your_api_key")

result = client.scrape(
    url="https://example.com",
    rules={
        "title": {
            "selector": "h1.product-title",
            "type": "text"
        },
        "price": {
            "selector": ".price-value",
            "type": "text",
            "transform": "number"  # Convert to number
        },
        "image": {
            "selector": "img.product-image",
            "attribute": "src"  # Extract attribute instead of text
        }
    }
)

print(result.data)
# {
#   "title": "Product Name",
#   "price": 99.99,
#   "image": "https://example.com/image.jpg"
# }

XPath Expressions

xpath_rules.py
python
result = client.scrape(
    url="https://example.com",
    rules={
        "title": {
            "xpath": "//h1[@class='product-title']/text()",
            "type": "text"
        },
        "description": {
            "xpath": "//div[@class='description']//p/text()",
            "type": "text",
            "join": " "  # Join multiple matches with space
        },
        "specs": {
            "xpath": "//table[@class='specs']//tr",
            "type": "list",
            "item": {
                "name": "./td[1]/text()",
                "value": "./td[2]/text()"
            }
        }
    }
)

print(result.data)

Node.js Example

custom_rules.js
javascript
const { ScrapeHubClient } = require('@scrapehub/node');

const client = new ScrapeHubClient({
  apiKey: process.env.SCRAPEHUB_API_KEY
});

async function scrapeWithRules() {
  const result = await client.scrape({
    url: 'https://example.com',
    rules: {
      title: {
        selector: 'h1.product-title',
        type: 'text'
      },
      price: {
        selector: '.price-value',
        type: 'text',
        transform: 'number'
      }
    }
  });

  console.log(result.data);
}

scrapeWithRules();

List Extraction

Extract Multiple Items

list_extraction.py
python
result = client.scrape(
    url="https://example.com/products",
    rules={
        "products": {
            "selector": ".product-card",
            "type": "list",
            "item": {
                "name": {
                    "selector": ".product-name",
                    "type": "text"
                },
                "price": {
                    "selector": ".price",
                    "type": "text",
                    "transform": "number"
                },
                "url": {
                    "selector": "a.product-link",
                    "attribute": "href"
                },
                "rating": {
                    "selector": ".rating",
                    "attribute": "data-rating",
                    "transform": "number"
                }
            }
        }
    }
)

# Access the list
for product in result.data['products']:
    print(f"{product['name']}: ${product['price']}")

Nested Lists

nested_lists.py
python
result = client.scrape(
    url="https://example.com/categories",
    rules={
        "categories": {
            "selector": ".category",
            "type": "list",
            "item": {
                "name": {
                    "selector": ".category-name",
                    "type": "text"
                },
                "products": {
                    "selector": ".product",
                    "type": "list",
                    "item": {
                        "name": ".product-name",
                        "price": ".product-price"
                    }
                }
            }
        }
    }
)

print(result.data)
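# Illustrative output shape (field values depend on the page):
# {
#   "categories": [
#     {
#       "name": "Electronics",
#       "products": [
#         {"name": "Laptop", "price": "$999.00"}
#       ]
#     }
#   ]
# }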

Data Transformations

Built-in Transformers

transformers.py
python
result = client.scrape(
    url="https://example.com",
    rules={
        "price": {
            "selector": ".price",
            "transform": "number"  # Extract numeric value
        },
        "date": {
            "selector": ".published-date",
            "transform": "date"  # Parse and normalize date
        },
        "email": {
            "selector": ".contact",
            "transform": "email"  # Extract email from text
        },
        "phone": {
            "selector": ".phone",
            "transform": "phone"  # Extract and format phone
        },
        "url": {
            "selector": "a",
            "attribute": "href",
            "transform": "url"  # Convert to absolute URL
        },
        "text": {
            "selector": ".description",
            "transform": "trim"  # Trim whitespace
        }
    }
)

Regular Expressions

regex_transform.py
python
result = client.scrape(
    url="https://example.com",
    rules={
        "sku": {
            "selector": ".product-info",
            "type": "text",
            "regex": r"SKU:\s*([A-Z0-9-]+)",  # Extract SKU
            "regex_group": 1  # Use first capture group
        },
        "dimensions": {
            "selector": ".specs",
            "type": "text",
            "regex": r"(\d+)\s*x\s*(\d+)\s*x\s*(\d+)",
            "regex_group": "all"  # Get all groups as list
        },
        "price_numbers": {
            "selector": ".pricing",
            "type": "text",
            "regex": r"\d+\.\d+",  # Find all price numbers
            "regex_all": True  # Get all matches
        }
    }
)

print(result.data)
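# Illustrative output shape (values depend on the page text):
# {
#   "sku": "AB-1234",                      # first capture group as a string
#   "dimensions": ["10", "20", "30"],      # all capture groups as a list
#   "price_numbers": ["19.99", "24.99"]    # all matches as a list
# }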

Custom JavaScript Transform

custom_transform.py
python
result = client.scrape(
    url="https://example.com",
    rules={
        "complex_data": {
            "selector": ".data-container",
            "type": "text",
            "custom_transform": """
                function(value) {
                    // Custom transformation logic
                    const parts = value.split('|');
                    return {
                        name: parts[0].trim(),
                        quantity: parseInt(parts[1]),
                        available: parts[2] === 'yes'
                    };
                }
            """
        }
    }
)

print(result.data)
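# Illustrative: for source text "Gadget|5|yes" the transform above returns
# {"complex_data": {"name": "Gadget", "quantity": 5, "available": True}}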

Conditional Extraction

Conditional Rules

conditional_rules.py
python
result = client.scrape(
    url="https://example.com",
    rules={
        "product_type": {
            "selector": ".product-type",
            "type": "text"
        },
        "digital_link": {
            "selector": ".download-link",
            "attribute": "href",
            "condition": {
                "field": "product_type",
                "equals": "digital"  # Only extract if digital product
            }
        },
        "shipping_weight": {
            "selector": ".weight",
            "type": "text",
            "condition": {
                "field": "product_type",
                "equals": "physical"
            }
        }
    }
)

print(result.data)

Pagination Rules

pagination_rules.py
python
result = client.scrape(
    url="https://example.com/products",
    rules={
        "products": {
            "selector": ".product",
            "type": "list",
            "item": {
                "name": ".product-name",
                "price": ".product-price"
            }
        }
    },
    pagination={
        "next_selector": "a.next-page",  # Next button selector
        "max_pages": 10,
        "wait_time": 2,  # Seconds between pages
        "stop_condition": {
            "selector": ".no-more-products",  # Stop if this appears
            "exists": True
        }
    }
)

# All pages are scraped automatically
print(f"Total products: {len(result.data['products'])}")

Advanced Features

Dynamic Content Handling

dynamic_content.py
python
result = client.scrape(
    url="https://example.com",
    rules={
        "lazy_loaded_images": {
            "selector": "img[data-src]",
            "attribute": "data-src",
            "type": "list",
            "wait_for": {
                "selector": "img[src]",  # Wait until images load
                "timeout": 10
            }
        }
    },
    javascript={
        "enabled": True,
        "scroll": True,  # Scroll to trigger lazy loading
        "wait": 2000  # Wait for content to load
    }
)

Custom JavaScript Execution

custom_javascript.py
python
result = client.scrape(
    url="https://example.com",
    javascript={
        "enabled": True,
        "before_scrape": """
            // Execute before extraction
            document.querySelector('.load-more').click();
            await new Promise(r => setTimeout(r, 2000));
        """,
        "custom_extraction": """
            // Custom extraction logic
            return {
                custom_data: window.myAppData,
                computed_value: calculateSomething()
            };
        """
    },
    rules={
        "title": ".title",
        "price": ".price"
    }
)

Error Handling in Rules

rules_error_handling.py
python
result = client.scrape(
    url="https://example.com",
    rules={
        "price": {
            "selector": ".price",
            "type": "text",
            "required": True,  # Fail if not found
            "default": None  # Use default if not found (overrides required)
        },
        "optional_field": {
            "selector": ".optional",
            "required": False,
            "default": "N/A"
        },
        "fallback_example": {
            "selectors": [  # Try multiple selectors
                ".primary-selector",
                ".backup-selector",
                ".last-resort-selector"
            ],
            "type": "text"
        }
    }
)
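
When a field marked required cannot be found and no default is supplied, the scrape fails rather than returning partial data. Below is a minimal sketch of handling that failure in client code, reusing the client from the earlier examples; the broad except clause is a placeholder, since this page does not document the SDK's specific exception types:

handle_required_failure.py
python
try:
    result = client.scrape(
        url="https://example.com",
        rules={
            "price": {
                "selector": ".price",
                "type": "text",
                "required": True  # Fail the scrape if not found
            }
        }
    )
    print(result.data)
except Exception as exc:  # Narrow to the SDK's error type if one is documented
    # Log and fall back instead of crashing a larger pipeline
    print(f"Extraction failed: {exc}")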

Rule Templates

Save and reuse common rule configurations:

rule_templates.py
python
# Save a rule template
template = client.rules.create_template(
    name="product-extraction",
    rules={
        "name": {
            "selector": ".product-name",
            "type": "text"
        },
        "price": {
            "selector": ".price",
            "transform": "number"
        },
        "rating": {
            "selector": ".rating",
            "attribute": "data-rating",
            "transform": "number"
        }
    }
)

# Use the template
result = client.scrape(
    url="https://example.com",
    template="product-extraction"  # or template=template.id
)

# List all templates
templates = client.rules.list_templates()
for t in templates:
    print(f"{t.name}: {t.id}")

Best Practices

Custom Rules Tips

  • Use specific selectors to avoid extracting the wrong data
  • Prefer CSS selectors over XPath for better performance
  • Always validate and transform data to expected formats
  • Use fallback selectors for resilient extraction
  • Test rules on multiple pages before production use (see the sketch after this list)
  • Create reusable templates for common patterns
  • Combine with Neural Engine for more robust extraction
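
As a concrete version of the multi-page testing tip above, here is a minimal sketch that runs one rule set against several sample pages and flags fields that come back empty. The sample URLs and the emptiness check are illustrative; adapt them to the target site:

test_rules_multiple_pages.py
python
rules = {
    "name": {"selector": ".product-name", "type": "text"},
    "price": {"selector": ".price", "transform": "number"}
}

# Illustrative sample URLs; use real pages from the target site
sample_urls = [
    "https://example.com/products/1",
    "https://example.com/products/2",
    "https://example.com/products/3"
]

for url in sample_urls:
    result = client.scrape(url=url, rules=rules)
    # Surface fragile selectors early by flagging fields that came back empty
    missing = [field for field, value in result.data.items() if value in (None, "", [])]
    print(f"{url}: missing fields -> {missing or 'none'}")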

Important Notes

  • Custom rules require manual updates when site structure changes
  • Complex JavaScript transforms may impact performance
  • Always handle missing or optional fields gracefully
  • Test extraction rules thoroughly before scaling up