Lead Generation Guide
Extract business contacts and build qualified lead lists
What You'll Learn
- Scrape business directories and listings
- Extract contact information (emails, phones, addresses)
- Build targeted lead databases
- Enrich existing lead data
- Export leads to CRM systems
Basic Lead Scraping
scrape_leads.py
from scrapehub import ScrapeHubClient
import pandas as pd

client = ScrapeHubClient(api_key="sk_live_xxxx_449x")

# Scrape a business directory category
result = client.scrape(
    url="https://business-directory.com/category/software-companies",
    engine="neural-x1",
    format="json"
)

# Normalize each listing into a flat lead record
leads = []
for business in result.data:
    lead = {
        'company_name': business.get('name'),
        'email': business.get('email'),
        'phone': business.get('phone'),
        'website': business.get('website'),
        'address': business.get('address'),
        'industry': business.get('category'),
        'description': business.get('description')
    }
    leads.append(lead)

# Export to CSV
df = pd.DataFrame(leads)
df.to_csv('leads.csv', index=False)

print(f"Extracted {len(leads)} leads")
print(f"Leads with emails: {df['email'].notna().sum()}")
print(f"Leads with phones: {df['phone'].notna().sum()}")
Multi-City Lead Generation
multi_city_leads.py
from scrapehub import AsyncScrapeHubClient
import asyncio
import pandas as pd

async def scrape_city_leads(client, city, category):
    """Scrape leads for a specific city"""
    result = await client.scrape(
        url=f"https://business-directory.com/{city}/{category}",
        engine="neural-x1",
        pagination={
            "enabled": True,
            "max_pages": 10
        }
    )

    # Tag each lead with its city
    for lead in result.data:
        lead['city'] = city

    return result.data

async def main():
    client = AsyncScrapeHubClient(api_key="sk_live_xxxx_449x")

    # Target cities
    cities = ['new-york', 'los-angeles', 'chicago', 'houston', 'phoenix']
    category = 'restaurants'

    # Scrape all cities concurrently
    tasks = [scrape_city_leads(client, city, category) for city in cities]
    results = await asyncio.gather(*tasks)

    # Combine all leads
    all_leads = []
    for city_leads in results:
        all_leads.extend(city_leads)

    # Export
    df = pd.DataFrame(all_leads)
    df.to_excel('multi_city_leads.xlsx', index=False)

    print(f"Total leads collected: {len(all_leads)}")
    print("\nLeads by city:")
    print(df['city'].value_counts())

asyncio.run(main())
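Five cities in parallel is harmless, but with dozens of targets you should cap concurrency. Here is a sketch using asyncio.Semaphore, reusing scrape_city_leads from above (the limit of 3 is an arbitrary politeness assumption, not a documented ScrapeHub quota):

async def scrape_cities_throttled(client, cities, category, max_concurrent=3):
    # Allow at most max_concurrent scrapes in flight at any time
    semaphore = asyncio.Semaphore(max_concurrent)

    async def bounded(city):
        async with semaphore:
            return await scrape_city_leads(client, city, category)

    results = await asyncio.gather(*(bounded(city) for city in cities))
    # Flatten the per-city lists into a single lead list
    return [lead for city_leads in results for lead in city_leads]

gather preserves input order, so the results still line up with the cities list.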
Contact Enrichment
enrich_leads.py
from scrapehub import ScrapeHubClient
import pandas as pd

class LeadEnricher:
    def __init__(self, api_key):
        self.client = ScrapeHubClient(api_key=api_key)

    def enrich_from_website(self, website_url):
        """Extract additional info from a company website"""
        result = self.client.scrape(
            url=website_url,
            engine="neural-x1",
            render_js=True
        )

        if result.data:
            company_data = result.data[0]
            return {
                'email': company_data.get('contact_email'),
                'phone': company_data.get('phone'),
                'social_linkedin': company_data.get('linkedin_url'),
                'social_twitter': company_data.get('twitter_url'),
                'employee_count': company_data.get('employees'),
                'founded_year': company_data.get('founded')
            }
        return {}

    def enrich_leads(self, leads_df):
        """Enrich an existing lead database with additional data"""
        enriched_leads = []

        for idx, lead in leads_df.iterrows():
            print(f"Enriching {idx + 1}/{len(leads_df)}: {lead['company_name']}")

            enriched = lead.to_dict()

            # Scrape the company website if one is available
            if pd.notna(lead['website']):
                extra_data = self.enrich_from_website(lead['website'])
                # Only overwrite fields the website actually yielded a value for
                enriched.update({k: v for k, v in extra_data.items() if v is not None})

            enriched_leads.append(enriched)

        return pd.DataFrame(enriched_leads)

# Usage
enricher = LeadEnricher("sk_live_xxxx_449x")

# Load existing leads
leads_df = pd.read_csv('leads.csv')

# Enrich with additional data
enriched_df = enricher.enrich_leads(leads_df.head(50))  # First 50 leads

# Save enriched data
enriched_df.to_csv('enriched_leads.csv', index=False)

print("\nEnrichment complete!")
print(f"Leads with LinkedIn: {enriched_df['social_linkedin'].notna().sum()}")
print(f"Leads with employee count: {enriched_df['employee_count'].notna().sum()}")
Industry-Specific Lead Lists
industry_leads.py
from scrapehub import ScrapeHubClient
import pandas as pd

client = ScrapeHubClient(api_key="sk_live_xxxx_449x")

# Target multiple industries
industries = [
    'software-development',
    'digital-marketing',
    'consulting',
    'real-estate',
    'healthcare'
]

all_leads = []

for industry in industries:
    print(f"\nScraping {industry}...")

    result = client.scrape(
        url=f"https://business-directory.com/category/{industry}",
        engine="neural-x1",
        pagination={
            "enabled": True,
            "max_pages": 20
        }
    )

    # Tag each lead with its industry
    for lead in result.data:
        lead['industry'] = industry
        all_leads.append(lead)

    print(f"  Found {len(result.data)} leads")

# Create DataFrame
df = pd.DataFrame(all_leads)

# Filter quality leads (have both email AND phone)
quality_leads = df[df['email'].notna() & df['phone'].notna()]

print("\n=== Lead Generation Summary ===")
print(f"Total leads: {len(df)}")
print(f"Quality leads (email + phone): {len(quality_leads)}")
print("\nLeads by industry:")
print(df['industry'].value_counts())

# Export both sets
df.to_excel('all_leads.xlsx', index=False, sheet_name='All Leads')
quality_leads.to_excel('quality_leads.xlsx', index=False, sheet_name='Quality Leads')
print("\nExported to all_leads.xlsx and quality_leads.xlsx")
LinkedIn Company Scraping
linkedin_leads.py
from scrapehub import ScrapeHubClient
from urllib.parse import quote_plus
import pandas as pd

client = ScrapeHubClient(api_key="sk_live_xxxx_449x")

def scrape_linkedin_companies(search_query, location=None):
    """Scrape companies from a LinkedIn company search"""
    # Build the search URL, escaping spaces and special characters
    url = f"https://www.linkedin.com/search/results/companies/?keywords={quote_plus(search_query)}"
    if location:
        url += f"&location={quote_plus(location)}"

    result = client.scrape(
        url=url,
        engine="neural-x1",
        render_js=True,
        wait_for_selector=".search-results-container",
        pagination={
            "enabled": True,
            "max_pages": 5
        }
    )
    return result.data

# Search for SaaS companies in San Francisco
companies = scrape_linkedin_companies(
    search_query="SaaS software",
    location="San Francisco Bay Area"
)

# Process and export
df = pd.DataFrame(companies)

print(f"Found {len(companies)} companies")
if 'company_size' in df.columns:
    print("\nCompany size distribution:")
    print(df['company_size'].value_counts())

# Export
df.to_csv('linkedin_companies.csv', index=False)
print("\nExported to linkedin_companies.csv")
Email Finder & Validation
email_finder.py
from scrapehub import ScrapeHubClient
import pandas as pd
import re

class EmailFinder:
    def __init__(self, api_key):
        self.client = ScrapeHubClient(api_key=api_key)

    def find_emails_on_website(self, url):
        """Find all email addresses on a website"""
        result = self.client.scrape(
            url=url,
            engine="neural-x1",
            render_js=True
        )

        emails = set()
        email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'

        if result.data:
            # Extract emails from every string field in the structured data
            for item in result.data:
                if isinstance(item, dict):
                    for value in item.values():
                        if isinstance(value, str):
                            found = re.findall(email_pattern, value)
                            emails.update(found)

        return list(emails)

    def find_contact_page_emails(self, domain):
        """Check common contact/about pages for emails"""
        contact_pages = [
            f"https://{domain}/contact",
            f"https://{domain}/about",
            f"https://{domain}/contact-us",
            f"https://{domain}/team"
        ]

        all_emails = []
        for page in contact_pages:
            try:
                emails = self.find_emails_on_website(page)
                all_emails.extend(emails)
                if emails:
                    print(f"  Found {len(emails)} emails on {page}")
            except Exception:
                # Missing pages (404s, redirects) are expected; skip them
                continue

        return list(set(all_emails))  # Remove duplicates

# Usage
finder = EmailFinder("sk_live_xxxx_449x")

# Load leads that are missing an email address
leads_df = pd.read_csv('leads.csv')
missing_email = leads_df[leads_df['email'].isna()]

print(f"Finding emails for {len(missing_email)} leads...")

for idx, lead in missing_email.iterrows():
    if pd.notna(lead['website']):
        print(f"\n{lead['company_name']}")

        # Strip the scheme and path to get the bare domain
        domain = lead['website'].replace('http://', '').replace('https://', '').split('/')[0]

        # Find emails
        emails = finder.find_contact_page_emails(domain)

        if emails:
            print(f"  ✓ Found: {', '.join(emails)}")
            leads_df.at[idx, 'email'] = emails[0]  # Use the first email found
        else:
            print("  ✗ No emails found")

# Save updated leads
leads_df.to_csv('leads_with_emails.csv', index=False)
print("\nUpdated leads saved to leads_with_emails.csv")
CRM Integration (Salesforce)
export_to_crm.py
from scrapehub import ScrapeHubClient
from simple_salesforce import Salesforce

class CRMExporter:
    def __init__(self, scrapehub_api_key, sf_username, sf_password, sf_token):
        self.scraper = ScrapeHubClient(api_key=scrapehub_api_key)
        self.sf = Salesforce(
            username=sf_username,
            password=sf_password,
            security_token=sf_token
        )

    def scrape_and_export(self, url, lead_source):
        """Scrape leads and export them to Salesforce"""
        # Scrape leads
        result = self.scraper.scrape(
            url=url,
            engine="neural-x1",
            pagination={"enabled": True, "max_pages": 10}
        )

        print(f"Scraped {len(result.data)} leads")

        # Create one Salesforce Lead record per scraped lead
        success_count = 0
        for lead in result.data:
            try:
                self.sf.Lead.create({
                    'Company': lead.get('company_name'),
                    'LastName': lead.get('contact_name', 'Unknown'),
                    'Email': lead.get('email'),
                    'Phone': lead.get('phone'),
                    'Website': lead.get('website'),
                    'Street': lead.get('address'),
                    'Industry': lead.get('industry'),
                    'LeadSource': lead_source,
                    'Description': lead.get('description'),
                    'Status': 'New'
                })
                success_count += 1
            except Exception as e:
                print(f"Error creating lead: {e}")

        print(f"\nSuccessfully exported {success_count}/{len(result.data)} leads to Salesforce")

# Usage
exporter = CRMExporter(
    scrapehub_api_key="sk_live_xxxx_449x",
    sf_username="your-sf-username",
    sf_password="your-sf-password",
    sf_token="your-sf-token"
)

# Scrape and export
exporter.scrape_and_export(
    url="https://business-directory.com/category/saas-companies",
    lead_source="Web Scraping - Business Directory"
)
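Creating records one at a time is slow for large lists. simple_salesforce also exposes the Bulk API; here is a sketch that inserts the same field mapping in batches (the batch size of 200 is an assumption, tune it to your org's limits):

def bulk_export(sf, records):
    """Insert many Lead dicts in one Bulk API job."""
    results = sf.bulk.Lead.insert(records, batch_size=200)
    created = sum(1 for r in results if r.get('success'))
    print(f"Bulk-created {created}/{len(records)} leads")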
Best Practices
- Always verify email addresses before using them for outreach (see the validation sketch in the Email Finder section)
- Comply with GDPR, CCPA, and local data privacy regulations
- Use pagination to collect comprehensive lead lists
- Enrich leads with social media profiles for better targeting
- Segment leads by industry, location, or company size
- Regularly update lead data to maintain accuracy
Common Data Sources
Business Directories
- Yellow Pages
- Yelp
- Google My Business
- Industry-specific directories
Professional Networks
- LinkedIn Companies
- Crunchbase
- AngelList
- Product Hunt
Review Sites
- G2
- Capterra
- Trustpilot
- Clutch
Industry Platforms
- Trade associations
- Conference attendees
- Award winners
- Press releases