SEO Data Extraction Guide
Extract SERP data, keywords, and competitor SEO metrics
What You'll Learn
- Scrape Google SERP results and rankings
- Extract meta tags, headers, and on-page SEO elements
- Monitor keyword positions and rank tracking
- Analyze competitor backlinks and content
- Build automated SEO auditing tools
SERP Scraping (Search Results)
scrape_serp.py
from scrapehub import ScrapeHubClient
import pandas as pd
client = ScrapeHubClient(api_key="sk_live_xxxx_449x")
def scrape_google_serp(keyword, location='United States'):
"""Scrape Google search results for a keyword"""
# Build Google search URL
search_url = f"https://www.google.com/search?q={keyword.replace(' ', '+')}"
result = client.scrape(
url=search_url,
engine="neural-x1",
render_js=True,
wait_for_selector="#search"
)
# Extract SERP features
serp_results = []
for item in result.data:
serp_results.append({
'keyword': keyword,
'position': item.get('position'),
'title': item.get('title'),
'url': item.get('url'),
'domain': item.get('domain'),
'snippet': item.get('description'),
'featured_snippet': item.get('is_featured', False),
'site_links': item.get('site_links', [])
})
return pd.DataFrame(serp_results)
# Scrape SERP for keyword
keyword = "best project management software"
df = scrape_google_serp(keyword)
print(f"\n=== SERP Results for '{keyword}' ===")
print(df[['position', 'title', 'domain']])
# Export
df.to_csv(f'serp_{keyword.replace(" ", "_")}.csv', index=False)
print(f"\nExported to serp_{keyword.replace(' ', '_')}.csv")Rank Tracking System
rank_tracker.py
from scrapehub import ScrapeHubClient
import pandas as pd
from datetime import datetime
class RankTracker:
def __init__(self, api_key):
self.client = ScrapeHubClient(api_key=api_key)
self.history_file = 'rank_history.csv'
def check_rankings(self, domain, keywords):
"""Check rankings for domain across multiple keywords"""
rankings = []
for keyword in keywords:
print(f"Checking '{keyword}'...")
search_url = f"https://www.google.com/search?q={keyword.replace(' ', '+')}"
result = self.client.scrape(
url=search_url,
engine="neural-x1",
render_js=True
)
# Find domain position
position = None
for idx, item in enumerate(result.data, 1):
if domain in item.get('url', ''):
position = idx
break
rankings.append({
'timestamp': datetime.now(),
'keyword': keyword,
'domain': domain,
'position': position,
'on_first_page': position <= 10 if position else False
})
if position:
print(f" Position: #{position}")
else:
print(f" Not in top 100")
return pd.DataFrame(rankings)
def save_rankings(self, rankings_df):
"""Save rankings to history"""
try:
existing = pd.read_csv(self.history_file)
combined = pd.concat([existing, rankings_df], ignore_index=True)
except FileNotFoundError:
combined = rankings_df
combined.to_csv(self.history_file, index=False)
print(f"\nSaved {len(rankings_df)} rankings to {self.history_file}")
def get_rank_changes(self, domain):
"""Detect ranking changes"""
df = pd.read_csv(self.history_file)
df['timestamp'] = pd.to_datetime(df['timestamp'])
changes = []
for keyword in df['keyword'].unique():
keyword_data = df[
(df['keyword'] == keyword) &
(df['domain'] == domain)
].sort_values('timestamp')
if len(keyword_data) >= 2:
previous = keyword_data.iloc[-2]
current = keyword_data.iloc[-1]
                prev_pos = previous['position']
                curr_pos = current['position']
                # Positions read back from CSV are floats, with NaN where the domain
                # was not found, so compare with pd.isna rather than None
                if pd.isna(prev_pos) and pd.isna(curr_pos):
                    continue
                if pd.isna(prev_pos):
                    change_str = f"Entered rankings at #{int(curr_pos)}"
                elif pd.isna(curr_pos):
                    change_str = f"Dropped out from #{int(prev_pos)}"
                elif prev_pos != curr_pos:
                    change = int(prev_pos) - int(curr_pos)
                    change_str = f"Moved from #{int(prev_pos)} to #{int(curr_pos)} ({change:+d})"
                else:
                    continue  # no change for this keyword
                changes.append({
                    'keyword': keyword,
                    'previous': prev_pos,
                    'current': curr_pos,
                    'change': change_str
                })
return pd.DataFrame(changes)
# Usage
tracker = RankTracker("sk_live_xxxx_449x")
# Your domain and target keywords
my_domain = "mywebsite.com"
keywords = [
"project management software",
"team collaboration tools",
"agile project management",
"task tracking software"
]
# Check current rankings
rankings = tracker.check_rankings(my_domain, keywords)
tracker.save_rankings(rankings)
# Detect changes
changes = tracker.get_rank_changes(my_domain)
if not changes.empty:
print("\n=== Ranking Changes ===")
for _, change in changes.iterrows():
print(f"\n{change['keyword']}")
print(f" {change['change']}")On-Page SEO Audit
seo_audit.py
from scrapehub import ScrapeHubClient
import pandas as pd
class SEOAuditor:
def __init__(self, api_key):
self.client = ScrapeHubClient(api_key=api_key)
def audit_page(self, url):
"""Perform comprehensive SEO audit of a page"""
result = self.client.scrape(
url=url,
engine="neural-x1",
render_js=True
)
if not result.data:
return None
page = result.data[0]
# Extract SEO elements
audit = {
'url': url,
# Meta tags
'title': page.get('title'),
'title_length': len(page.get('title', '')),
'meta_description': page.get('meta_description'),
'meta_description_length': len(page.get('meta_description', '')),
'meta_keywords': page.get('meta_keywords'),
# Headers
'h1_tags': page.get('h1_tags', []),
'h1_count': len(page.get('h1_tags', [])),
'h2_count': len(page.get('h2_tags', [])),
# Content
'word_count': page.get('word_count', 0),
'images_count': len(page.get('images', [])),
'images_with_alt': len([img for img in page.get('images', []) if img.get('alt')]),
# Links
'internal_links': len(page.get('internal_links', [])),
'external_links': len(page.get('external_links', [])),
# Technical
'has_canonical': bool(page.get('canonical_url')),
'canonical_url': page.get('canonical_url'),
'has_robots_meta': bool(page.get('robots_meta')),
'robots_meta': page.get('robots_meta'),
'has_schema': bool(page.get('schema_markup')),
# Social
'og_title': page.get('og_title'),
'og_description': page.get('og_description'),
'og_image': page.get('og_image'),
'twitter_card': page.get('twitter_card')
}
return audit
def generate_issues(self, audit):
"""Identify SEO issues"""
issues = []
# Title issues
if not audit['title']:
issues.append({'severity': 'high', 'issue': 'Missing title tag'})
elif audit['title_length'] < 30:
issues.append({'severity': 'medium', 'issue': f'Title too short ({audit["title_length"]} chars, recommended 30-60)'})
elif audit['title_length'] > 60:
issues.append({'severity': 'low', 'issue': f'Title too long ({audit["title_length"]} chars, may be truncated)'})
# Meta description issues
if not audit['meta_description']:
issues.append({'severity': 'high', 'issue': 'Missing meta description'})
elif audit['meta_description_length'] < 120:
issues.append({'severity': 'medium', 'issue': f'Meta description too short ({audit["meta_description_length"]} chars)'})
# Header issues
if audit['h1_count'] == 0:
issues.append({'severity': 'high', 'issue': 'Missing H1 tag'})
elif audit['h1_count'] > 1:
issues.append({'severity': 'medium', 'issue': f'Multiple H1 tags ({audit["h1_count"]})'})
# Content issues
if audit['word_count'] < 300:
issues.append({'severity': 'medium', 'issue': f'Thin content ({audit["word_count"]} words)'})
# Image issues
if audit['images_count'] > 0:
missing_alt = audit['images_count'] - audit['images_with_alt']
if missing_alt > 0:
issues.append({'severity': 'medium', 'issue': f'{missing_alt} images missing alt text'})
# Technical issues
if not audit['has_canonical']:
issues.append({'severity': 'low', 'issue': 'Missing canonical tag'})
# Social issues
if not audit['og_title']:
issues.append({'severity': 'low', 'issue': 'Missing Open Graph tags'})
return pd.DataFrame(issues)
def audit_website(self, urls):
"""Audit multiple pages"""
all_audits = []
all_issues = []
for url in urls:
print(f"\nAuditing: {url}")
audit = self.audit_page(url)
if audit:
all_audits.append(audit)
issues = self.generate_issues(audit)
for _, issue in issues.iterrows():
issue_dict = issue.to_dict()
issue_dict['url'] = url
all_issues.append(issue_dict)
print(f" Found {len(issues)} issues")
return pd.DataFrame(all_audits), pd.DataFrame(all_issues)
# Usage
auditor = SEOAuditor("sk_live_xxxx_449x")
# Pages to audit
pages = [
"https://mywebsite.com/",
"https://mywebsite.com/products",
"https://mywebsite.com/about",
"https://mywebsite.com/contact"
]
# Perform audit
audits_df, issues_df = auditor.audit_website(pages)
# Export reports
audits_df.to_excel('seo_audit_report.xlsx', index=False)
issues_df.to_excel('seo_issues.xlsx', index=False)
# Summary
print("\n=== Audit Summary ===")
print(f"Pages audited: {len(audits_df)}")
print(f"Total issues: {len(issues_df)}")
print(f"\nIssues by severity:")
print(issues_df['severity'].value_counts())
Competitor Content Analysis
competitor_content.py
from scrapehub import AsyncScrapeHubClient
import asyncio
import pandas as pd
async def analyze_competitor_content(competitor_urls):
"""Analyze content strategy of competitors"""
client = AsyncScrapeHubClient(api_key="sk_live_xxxx_449x")
# Scrape all competitors
tasks = [client.scrape(url=url, engine="neural-x1", render_js=True) for url in competitor_urls]
results = await asyncio.gather(*tasks)
analysis = []
for url, result in zip(competitor_urls, results):
if result.data:
page = result.data[0]
analysis.append({
'url': url,
'domain': url.split('/')[2],
'title': page.get('title'),
'word_count': page.get('word_count', 0),
'h1_tags': len(page.get('h1_tags', [])),
'h2_tags': len(page.get('h2_tags', [])),
'images': len(page.get('images', [])),
'videos': len(page.get('videos', [])),
'internal_links': len(page.get('internal_links', [])),
'external_links': len(page.get('external_links', [])),
'has_faq': bool(page.get('faq_schema')),
'has_howto': bool(page.get('howto_schema')),
'readability_score': page.get('readability_score')
})
return pd.DataFrame(analysis)
# Competitor URLs (top 5 ranking pages for a keyword)
competitors = [
"https://competitor1.com/best-project-tools",
"https://competitor2.com/project-management-guide",
"https://competitor3.com/top-pm-software",
"https://competitor4.com/project-tools-comparison",
"https://competitor5.com/pm-software-review"
]
# Analyze
df = asyncio.run(analyze_competitor_content(competitors))
# Insights
print("=== Competitor Content Analysis ===")
print(f"\nAverage word count: {df['word_count'].mean():.0f}")
print(f"Average images per page: {df['images'].mean():.1f}")
print(f"Average internal links: {df['internal_links'].mean():.1f}")
print(f"Pages with FAQ schema: {df['has_faq'].sum()}")
print("\n=== Top Performers ===")
top_performer = df.loc[df['word_count'].idxmax()]
print(f"Most comprehensive: {top_performer['domain']}")
print(f" Word count: {top_performer['word_count']}")
print(f" Images: {top_performer['images']}")
# Export
df.to_excel('competitor_content_analysis.xlsx', index=False)
Keyword Gap Analysis
keyword_gap.py
from scrapehub import ScrapeHubClient
import pandas as pd
class KeywordGapAnalyzer:
def __init__(self, api_key):
self.client = ScrapeHubClient(api_key=api_key)
def get_ranking_keywords(self, domain):
"""Get keywords that a domain ranks for"""
# This would typically use SEO tools API
# For demonstration, scraping sitemap or key pages
sitemap_url = f"https://{domain}/sitemap.xml"
result = self.client.scrape(
url=sitemap_url,
engine="neural-x1"
)
# Extract URLs from sitemap
urls = [item['url'] for item in result.data if 'url' in item]
# For each URL, extract target keywords from content
keywords = []
for url in urls[:10]: # First 10 pages
page_result = self.client.scrape(url=url, engine="neural-x1")
if page_result.data:
page = page_result.data[0]
# Extract keywords from title, h1, meta
page_keywords = self._extract_keywords(page)
keywords.extend(page_keywords)
return list(set(keywords))
def _extract_keywords(self, page):
"""Extract potential keywords from page content"""
keywords = []
# From title
if page.get('title'):
keywords.extend(page['title'].lower().split())
# From H1 tags
for h1 in page.get('h1_tags', []):
keywords.extend(h1.lower().split())
# From meta keywords
if page.get('meta_keywords'):
keywords.extend([k.strip() for k in page['meta_keywords'].split(',')])
return keywords
def find_gaps(self, my_domain, competitor_domains):
"""Find keyword gaps between you and competitors"""
print("Analyzing keyword coverage...")
# Get my keywords
print(f"\nAnalyzing {my_domain}...")
my_keywords = set(self.get_ranking_keywords(my_domain))
print(f" Found {len(my_keywords)} keywords")
# Get competitor keywords
competitor_keywords = {}
for competitor in competitor_domains:
print(f"\nAnalyzing {competitor}...")
keywords = set(self.get_ranking_keywords(competitor))
competitor_keywords[competitor] = keywords
print(f" Found {len(keywords)} keywords")
# Find gaps (keywords competitors rank for but you don't)
all_competitor_keywords = set()
for keywords in competitor_keywords.values():
all_competitor_keywords.update(keywords)
gaps = all_competitor_keywords - my_keywords
# Find opportunities (multiple competitors rank for same keyword)
opportunities = []
for keyword in gaps:
ranking_competitors = [
comp for comp, kw in competitor_keywords.items() if keyword in kw
]
if len(ranking_competitors) >= 2:
opportunities.append({
'keyword': keyword,
'competitors_ranking': len(ranking_competitors),
'competitors': ranking_competitors
})
return pd.DataFrame(opportunities).sort_values('competitors_ranking', ascending=False)
# Usage
analyzer = KeywordGapAnalyzer("sk_live_xxxx_449x")
my_domain = "mywebsite.com"
competitors = [
"competitor1.com",
"competitor2.com",
"competitor3.com"
]
# Find gaps
gaps_df = analyzer.find_gaps(my_domain, competitors)
print("\n=== Keyword Opportunities ===")
print(f"Total gap keywords: {len(gaps_df)}")
print("\nTop opportunities (multiple competitors rank):")
print(gaps_df.head(20))
# Export
gaps_df.to_excel('keyword_gaps.xlsx', index=False)
Backlink Profile Analysis
backlink_analysis.py
from scrapehub import ScrapeHubClient
import pandas as pd
class BacklinkAnalyzer:
def __init__(self, api_key):
self.client = ScrapeHubClient(api_key=api_key)
def find_competitor_backlinks(self, competitor_url):
"""Find pages linking to competitor"""
        # Use a Google "link:" search to approximate backlinks
        # Note: Google has deprecated the link: operator, so results are sparse;
        # a dedicated backlink data source is more reliable in practice
search_query = f"link:{competitor_url}"
search_url = f"https://www.google.com/search?q={search_query}"
result = self.client.scrape(
url=search_url,
engine="neural-x1",
render_js=True
)
backlinks = []
for item in result.data:
backlinks.append({
'linking_domain': item.get('domain'),
'linking_url': item.get('url'),
'anchor_text': item.get('title'),
'snippet': item.get('description')
})
return pd.DataFrame(backlinks)
def analyze_linking_domains(self, backlinks_df):
"""Analyze backlink profile"""
analysis = {
'total_backlinks': len(backlinks_df),
'unique_domains': backlinks_df['linking_domain'].nunique(),
'top_domains': backlinks_df['linking_domain'].value_counts().head(10)
}
return analysis
def find_link_opportunities(self, my_domain, competitor_domains):
"""Find sites that link to competitors but not to you"""
print("Finding link opportunities...")
# Get my backlinks
print(f"\nAnalyzing backlinks to {my_domain}...")
my_backlinks = self.find_competitor_backlinks(my_domain)
my_domains = set(my_backlinks['linking_domain'].unique())
# Get competitor backlinks
all_competitor_backlinks = []
for competitor in competitor_domains:
print(f"Analyzing backlinks to {competitor}...")
backlinks = self.find_competitor_backlinks(competitor)
backlinks['competitor'] = competitor
all_competitor_backlinks.append(backlinks)
competitor_backlinks_df = pd.concat(all_competitor_backlinks, ignore_index=True)
# Find opportunities (domains linking to competitors but not to me)
competitor_domains_set = set(competitor_backlinks_df['linking_domain'].unique())
opportunities = competitor_domains_set - my_domains
# Create opportunities dataframe
opportunities_df = competitor_backlinks_df[
competitor_backlinks_df['linking_domain'].isin(opportunities)
]
return opportunities_df
# Usage
analyzer = BacklinkAnalyzer("sk_live_xxxx_449x")
my_domain = "mywebsite.com"
competitors = ["competitor1.com", "competitor2.com"]
# Find opportunities
opportunities = analyzer.find_link_opportunities(my_domain, competitors)
print("\n=== Link Building Opportunities ===")
print(f"Found {len(opportunities)} potential link opportunities")
print("\nTop domains to target:")
print(opportunities['linking_domain'].value_counts().head(20))
# Export for outreach
opportunities.to_excel('link_opportunities.xlsx', index=False)
print("\nExported to link_opportunities.xlsx")Best Practices
- Track rankings consistently at the same time daily/weekly (see the scheduling sketch after this list)
- Monitor both organic and featured snippet positions
- Use JavaScript rendering for SERP and dynamic content
- Respect search engine rate limits and robots.txt
- Combine automated data with manual analysis
- Track local rankings from different geographic locations
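The first and fourth points above can be combined into a small scheduler around the RankTracker class from rank_tracker.py. The sketch below is illustrative only: the run hour, the delay between requests, and the module import path are assumptions, not ScrapeHub requirements.
import time
from datetime import datetime

# Assumes the RankTracker class above is saved as rank_tracker.py
from rank_tracker import RankTracker

tracker = RankTracker("sk_live_xxxx_449x")
my_domain = "mywebsite.com"
keywords = [
    "project management software",
    "team collaboration tools"
]

RUN_HOUR = 6          # check at the same hour every day for comparable data
REQUEST_DELAY = 10    # seconds between keyword checks, to stay under rate limits

while True:
    if datetime.now().hour == RUN_HOUR:
        for keyword in keywords:
            rankings = tracker.check_rankings(my_domain, [keyword])
            tracker.save_rankings(rankings)
            time.sleep(REQUEST_DELAY)   # pace requests between keywords
        time.sleep(60 * 60)             # skip past the run hour so it fires once per day
    time.sleep(60)                      # poll the clock once a minute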
Key SEO Metrics to Track
Rankings & Visibility
- Keyword positions (top 3, top 10, top 100); a summary sketch follows this list
- SERP features captured (featured snippets, PAA)
- Domain authority and page authority
- Organic traffic trends
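Position buckets can be computed straight from the rank_history.csv written by the RankTracker above. This minimal sketch only assumes that file's columns (timestamp, keyword, position) and uses the usual top-3/top-10 cut-offs.
import pandas as pd

df = pd.read_csv('rank_history.csv', parse_dates=['timestamp'])

# Most recent check per keyword
latest = df.sort_values('timestamp').drop_duplicates('keyword', keep='last')
ranked = latest.dropna(subset=['position'])

print(f"Keywords tracked:  {len(latest)}")
print(f"Ranking anywhere:  {len(ranked)}")
print(f"Top 3 positions:   {(ranked['position'] <= 3).sum()}")
print(f"Top 10 positions:  {(ranked['position'] <= 10).sum()}")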
On-Page Elements
- Title and meta description optimization
- Header structure (H1-H6)
- Content depth and quality
- Internal linking structure
Technical SEO
- Page load speed
- Mobile responsiveness
- Structured data implementation
- Canonical tags and redirects (a spot-check sketch follows this list)
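Basic technical checks do not need the full ScrapeHub pipeline. A minimal sketch using requests and BeautifulSoup (both assumed installed) covers redirects, canonical tags, and JSON-LD structured data; note that the timing below is server response time, not a full rendered page load.
import requests
from bs4 import BeautifulSoup

def technical_spot_check(url):
    """Quick technical checks: redirects, response time, canonical, structured data."""
    resp = requests.get(url, timeout=30, allow_redirects=True)
    soup = BeautifulSoup(resp.text, 'html.parser')

    canonical = soup.find('link', rel='canonical')
    has_json_ld = bool(soup.find_all('script', type='application/ld+json'))

    return {
        'url': url,
        'final_url': resp.url,
        'redirects': len(resp.history),                     # hops in the redirect chain
        'status_code': resp.status_code,
        'response_seconds': resp.elapsed.total_seconds(),   # server response, not full render
        'canonical': canonical.get('href') if canonical else None,
        'has_json_ld': has_json_ld
    }

print(technical_spot_check("https://mywebsite.com/"))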
Competitive Analysis
- Competitor keyword rankings
- Content gap identification
- Backlink profile comparison
- SERP feature ownership (see the sketch below)
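SERP feature ownership can be read off the DataFrame that scrape_google_serp() from scrape_serp.py already returns, since each row carries a featured_snippet flag. A minimal sketch, assuming that script is importable as a module:
import pandas as pd

# scrape_google_serp() is defined in scrape_serp.py above
from scrape_serp import scrape_google_serp

keywords = ["project management software", "team collaboration tools"]
all_serps = pd.concat([scrape_google_serp(kw) for kw in keywords], ignore_index=True)

# Which domain owns the featured snippet for each keyword
featured = all_serps[all_serps['featured_snippet']]
print("Featured snippet owners by keyword:")
print(featured[['keyword', 'domain', 'url']])

print("\nFeatured snippets held per domain:")
print(featured['domain'].value_counts())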