Lead Generation Guide
Extract business contacts and build qualified lead lists
What You'll Learn
- Scrape business directories and listings
- Extract contact information (emails, phones, addresses)
- Build targeted lead databases
- Enrich existing lead data
- Export leads to CRM systems
Basic Lead Scraping
scrape_leads.py
from scrapehub import ScrapeHubClient
import pandas as pd

client = ScrapeHubClient(api_key="sk_live_xxxx_449x")

# Scrape a business directory category
result = client.scrape(
    url="https://business-directory.com/category/software-companies",
    engine="neural-x1",
    format="json"
)

# Normalize each listing into a flat lead record
leads = []
for business in result.data:
    lead = {
        'company_name': business.get('name'),
        'email': business.get('email'),
        'phone': business.get('phone'),
        'website': business.get('website'),
        'address': business.get('address'),
        'industry': business.get('category'),
        'description': business.get('description')
    }
    leads.append(lead)

# Export to CSV
df = pd.DataFrame(leads)
df.to_csv('leads.csv', index=False)

print(f"Extracted {len(leads)} leads")
print(f"Leads with emails: {df['email'].notna().sum()}")
print(f"Leads with phones: {df['phone'].notna().sum()}")
Multi-City Lead Generation
multi_city_leads.py
from scrapehub import AsyncScrapeHubClient
import asyncio
import pandas as pd

async def scrape_city_leads(client, city, category):
    """Scrape leads for a specific city"""
    result = await client.scrape(
        url=f"https://business-directory.com/{city}/{category}",
        engine="neural-x1",
        pagination={
            "enabled": True,
            "max_pages": 10
        }
    )

    # Tag each lead with its city
    for lead in result.data:
        lead['city'] = city

    return result.data

async def main():
    client = AsyncScrapeHubClient(api_key="sk_live_xxxx_449x")

    # Target cities
    cities = ['new-york', 'los-angeles', 'chicago', 'houston', 'phoenix']
    category = 'restaurants'

    # Scrape all cities concurrently
    tasks = [scrape_city_leads(client, city, category) for city in cities]
    results = await asyncio.gather(*tasks)

    # Combine all leads
    all_leads = []
    for city_leads in results:
        all_leads.extend(city_leads)

    # Export
    df = pd.DataFrame(all_leads)
    df.to_excel('multi_city_leads.xlsx', index=False)

    print(f"Total leads collected: {len(all_leads)}")
    print("\nLeads by city:")
    print(df['city'].value_counts())

asyncio.run(main())
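Five cities in parallel is harmless, but with dozens of targets you should cap concurrency. Here is a sketch using asyncio.Semaphore, reusing scrape_city_leads from above (the limit of 3 is an arbitrary politeness assumption, not a documented ScrapeHub quota):

async def scrape_cities_throttled(client, cities, category, max_concurrent=3):
    # Allow at most max_concurrent scrapes in flight at any time
    semaphore = asyncio.Semaphore(max_concurrent)

    async def bounded(city):
        async with semaphore:
            return await scrape_city_leads(client, city, category)

    results = await asyncio.gather(*(bounded(city) for city in cities))
    # Flatten the per-city lists into a single lead list
    return [lead for city_leads in results for lead in city_leads]

gather preserves input order, so the results still line up with the cities list.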
Contact Enrichment
enrich_leads.py
from scrapehub import ScrapeHubClient
import pandas as pd

class LeadEnricher:
    def __init__(self, api_key):
        self.client = ScrapeHubClient(api_key=api_key)

    def enrich_from_website(self, website_url):
        """Extract additional info from a company website"""
        result = self.client.scrape(
            url=website_url,
            engine="neural-x1",
            render_js=True
        )

        if result.data:
            company_data = result.data[0]
            return {
                'email': company_data.get('contact_email'),
                'phone': company_data.get('phone'),
                'social_linkedin': company_data.get('linkedin_url'),
                'social_twitter': company_data.get('twitter_url'),
                'employee_count': company_data.get('employees'),
                'founded_year': company_data.get('founded')
            }
        return {}

    def enrich_leads(self, leads_df):
        """Enrich an existing lead database with additional data"""
        enriched_leads = []

        for idx, lead in leads_df.iterrows():
            print(f"Enriching {idx + 1}/{len(leads_df)}: {lead['company_name']}")

            enriched = lead.to_dict()

            # Scrape the company website if one is available
            if pd.notna(lead['website']):
                extra_data = self.enrich_from_website(lead['website'])
                # Only overwrite fields the website actually yielded a value for
                enriched.update({k: v for k, v in extra_data.items() if v is not None})

            enriched_leads.append(enriched)

        return pd.DataFrame(enriched_leads)

# Usage
enricher = LeadEnricher("sk_live_xxxx_449x")

# Load existing leads
leads_df = pd.read_csv('leads.csv')

# Enrich with additional data
enriched_df = enricher.enrich_leads(leads_df.head(50))  # First 50 leads

# Save enriched data
enriched_df.to_csv('enriched_leads.csv', index=False)

print("\nEnrichment complete!")
print(f"Leads with LinkedIn: {enriched_df['social_linkedin'].notna().sum()}")
print(f"Leads with employee count: {enriched_df['employee_count'].notna().sum()}")
Industry-Specific Lead Lists
industry_leads.py
from scrapehub import ScrapeHubClient
import pandas as pd

client = ScrapeHubClient(api_key="sk_live_xxxx_449x")

# Target multiple industries
industries = [
    'software-development',
    'digital-marketing',
    'consulting',
    'real-estate',
    'healthcare'
]

all_leads = []

for industry in industries:
    print(f"\nScraping {industry}...")

    result = client.scrape(
        url=f"https://business-directory.com/category/{industry}",
        engine="neural-x1",
        pagination={
            "enabled": True,
            "max_pages": 20
        }
    )

    # Tag each lead with its industry
    for lead in result.data:
        lead['industry'] = industry
        all_leads.append(lead)

    print(f"  Found {len(result.data)} leads")

# Create DataFrame
df = pd.DataFrame(all_leads)

# Filter quality leads (have both email AND phone)
quality_leads = df[df['email'].notna() & df['phone'].notna()]

print("\n=== Lead Generation Summary ===")
print(f"Total leads: {len(df)}")
print(f"Quality leads (email + phone): {len(quality_leads)}")
print("\nLeads by industry:")
print(df['industry'].value_counts())

# Export both sets
df.to_excel('all_leads.xlsx', index=False, sheet_name='All Leads')
quality_leads.to_excel('quality_leads.xlsx', index=False, sheet_name='Quality Leads')
print("\nExported to all_leads.xlsx and quality_leads.xlsx")
LinkedIn Company Scraping
linkedin_leads.py
from scrapehub import ScrapeHubClient
from urllib.parse import quote_plus
import pandas as pd

client = ScrapeHubClient(api_key="sk_live_xxxx_449x")

def scrape_linkedin_companies(search_query, location=None):
    """Scrape companies from a LinkedIn company search"""
    # Build the search URL, escaping spaces and special characters
    url = f"https://www.linkedin.com/search/results/companies/?keywords={quote_plus(search_query)}"
    if location:
        url += f"&location={quote_plus(location)}"

    result = client.scrape(
        url=url,
        engine="neural-x1",
        render_js=True,
        wait_for_selector=".search-results-container",
        pagination={
            "enabled": True,
            "max_pages": 5
        }
    )
    return result.data

# Search for SaaS companies in San Francisco
companies = scrape_linkedin_companies(
    search_query="SaaS software",
    location="San Francisco Bay Area"
)

# Process and export
df = pd.DataFrame(companies)

print(f"Found {len(companies)} companies")
if 'company_size' in df.columns:
    print("\nCompany size distribution:")
    print(df['company_size'].value_counts())

# Export
df.to_csv('linkedin_companies.csv', index=False)
print("\nExported to linkedin_companies.csv")
Email Finder & Validation
email_finder.py
from scrapehub import ScrapeHubClient
import pandas as pd
import re

class EmailFinder:
    def __init__(self, api_key):
        self.client = ScrapeHubClient(api_key=api_key)

    def find_emails_on_website(self, url):
        """Find all email addresses on a website"""
        result = self.client.scrape(
            url=url,
            engine="neural-x1",
            render_js=True
        )

        emails = set()
        email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'

        if result.data:
            # Extract emails from every string field in the structured data
            for item in result.data:
                if isinstance(item, dict):
                    for value in item.values():
                        if isinstance(value, str):
                            found = re.findall(email_pattern, value)
                            emails.update(found)

        return list(emails)

    def find_contact_page_emails(self, domain):
        """Check common contact/about pages for emails"""
        contact_pages = [
            f"https://{domain}/contact",
            f"https://{domain}/about",
            f"https://{domain}/contact-us",
            f"https://{domain}/team"
        ]

        all_emails = []
        for page in contact_pages:
            try:
                emails = self.find_emails_on_website(page)
                all_emails.extend(emails)
                if emails:
                    print(f"  Found {len(emails)} emails on {page}")
            except Exception:
                # Missing pages (404s, redirects) are expected; skip them
                continue

        return list(set(all_emails))  # Remove duplicates

# Usage
finder = EmailFinder("sk_live_xxxx_449x")

# Load leads that are missing an email address
leads_df = pd.read_csv('leads.csv')
missing_email = leads_df[leads_df['email'].isna()]

print(f"Finding emails for {len(missing_email)} leads...")

for idx, lead in missing_email.iterrows():
    if pd.notna(lead['website']):
        print(f"\n{lead['company_name']}")

        # Strip the scheme and path to get the bare domain
        domain = lead['website'].replace('http://', '').replace('https://', '').split('/')[0]

        # Find emails
        emails = finder.find_contact_page_emails(domain)

        if emails:
            print(f"  ✓ Found: {', '.join(emails)}")
            leads_df.at[idx, 'email'] = emails[0]  # Use the first email found
        else:
            print("  ✗ No emails found")

# Save updated leads
leads_df.to_csv('leads_with_emails.csv', index=False)
print("\nUpdated leads saved to leads_with_emails.csv")
CRM Integration (Salesforce)
export_to_crm.py
from scrapehub import ScrapeHubClient
from simple_salesforce import Salesforce

class CRMExporter:
    def __init__(self, scrapehub_api_key, sf_username, sf_password, sf_token):
        self.scraper = ScrapeHubClient(api_key=scrapehub_api_key)
        self.sf = Salesforce(
            username=sf_username,
            password=sf_password,
            security_token=sf_token
        )

    def scrape_and_export(self, url, lead_source):
        """Scrape leads and export them to Salesforce"""
        # Scrape leads
        result = self.scraper.scrape(
            url=url,
            engine="neural-x1",
            pagination={"enabled": True, "max_pages": 10}
        )

        print(f"Scraped {len(result.data)} leads")

        # Create one Salesforce Lead record per scraped lead
        success_count = 0
        for lead in result.data:
            try:
                self.sf.Lead.create({
                    'Company': lead.get('company_name'),
                    'LastName': lead.get('contact_name', 'Unknown'),
                    'Email': lead.get('email'),
                    'Phone': lead.get('phone'),
                    'Website': lead.get('website'),
                    'Street': lead.get('address'),
                    'Industry': lead.get('industry'),
                    'LeadSource': lead_source,
                    'Description': lead.get('description'),
                    'Status': 'New'
                })
                success_count += 1
            except Exception as e:
                print(f"Error creating lead: {e}")

        print(f"\nSuccessfully exported {success_count}/{len(result.data)} leads to Salesforce")

# Usage
exporter = CRMExporter(
    scrapehub_api_key="sk_live_xxxx_449x",
    sf_username="your-sf-username",
    sf_password="your-sf-password",
    sf_token="your-sf-token"
)

# Scrape and export
exporter.scrape_and_export(
    url="https://business-directory.com/category/saas-companies",
    lead_source="Web Scraping - Business Directory"
)
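Creating records one at a time is slow for large lists. simple_salesforce also exposes the Bulk API; here is a sketch that inserts the same field mapping in batches (the batch size of 200 is an assumption, tune it to your org's limits):

def bulk_export(sf, records):
    """Insert many Lead dicts in one Bulk API job."""
    results = sf.bulk.Lead.insert(records, batch_size=200)
    created = sum(1 for r in results if r.get('success'))
    print(f"Bulk-created {created}/{len(records)} leads")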
Best Practices
- Always verify email addresses before using them for outreach (see the validation sketch in the Email Finder section)
- Comply with GDPR, CCPA, and local data privacy regulations
- Use pagination to collect comprehensive lead lists
- Enrich leads with social media profiles for better targeting
- Segment leads by industry, location, or company size
- Regularly update lead data to maintain accuracy
Common Data Sources
Business Directories
- Yellow Pages
- Yelp
- Google My Business
- Industry-specific directories
Professional Networks
- LinkedIn Companies
- Crunchbase
- AngelList
- Product Hunt
Review Sites
- G2
- Capterra
- Trustpilot
- Clutch
Industry Platforms
- Trade associations
- Conference attendees
- Award winners
- Press releases