SEO Data Extraction Guide

Extract SERP data, keywords, and competitor SEO metrics

What You'll Learn

  • Scrape Google SERP results and rankings
  • Extract meta tags, headers, and on-page SEO elements
  • Monitor keyword positions and track ranking changes over time
  • Analyze competitor backlinks and content
  • Build automated SEO auditing tools

SERP Scraping (Search Results)

scrape_serp.py
python
from scrapehub import ScrapeHubClient
from urllib.parse import quote_plus
import pandas as pd

client = ScrapeHubClient(api_key="sk_live_xxxx_449x")

def scrape_google_serp(keyword, location='United States'):
    """Scrape Google search results for a keyword.

    Note: location is accepted but not applied here; geo-targeted results
    typically need extra query parameters or a geo-located proxy.
    """

    # Build the Google search URL (quote_plus encodes spaces and special characters)
    search_url = f"https://www.google.com/search?q={quote_plus(keyword)}"

    result = client.scrape(
        url=search_url,
        engine="neural-x1",
        render_js=True,
        wait_for_selector="#search"
    )

    # Extract SERP features
    serp_results = []

    for item in result.data:
        serp_results.append({
            'keyword': keyword,
            'position': item.get('position'),
            'title': item.get('title'),
            'url': item.get('url'),
            'domain': item.get('domain'),
            'snippet': item.get('description'),
            'featured_snippet': item.get('is_featured', False),
            'site_links': item.get('site_links', [])
        })

    return pd.DataFrame(serp_results)

# Scrape SERP for keyword
keyword = "best project management software"
df = scrape_google_serp(keyword)

print(f"\n=== SERP Results for '{keyword}' ===")
print(df[['position', 'title', 'domain']])

# Export
df.to_csv(f'serp_{keyword.replace(" ", "_")}.csv', index=False)
print(f"\nExported to serp_{keyword.replace(' ', '_')}.csv")

Rank Tracking System

rank_tracker.py
python
from scrapehub import ScrapeHubClient
from urllib.parse import quote_plus
import pandas as pd
from datetime import datetime

class RankTracker:
    def __init__(self, api_key):
        self.client = ScrapeHubClient(api_key=api_key)
        self.history_file = 'rank_history.csv'

    def check_rankings(self, domain, keywords):
        """Check rankings for domain across multiple keywords"""

        rankings = []

        for keyword in keywords:
            print(f"Checking '{keyword}'...")

            search_url = f"https://www.google.com/search?q={quote_plus(keyword)}"

            result = self.client.scrape(
                url=search_url,
                engine="neural-x1",
                render_js=True
            )

            # Find domain position
            position = None
            for idx, item in enumerate(result.data, 1):
                if domain in item.get('url', ''):
                    position = idx
                    break

            rankings.append({
                'timestamp': datetime.now(),
                'keyword': keyword,
                'domain': domain,
                'position': position,
                'on_first_page': position is not None and position <= 10
            })

            if position:
                print(f"  Position: #{position}")
            else:
                print(f"  Not in top 100")

        return pd.DataFrame(rankings)

    def save_rankings(self, rankings_df):
        """Save rankings to history"""
        try:
            existing = pd.read_csv(self.history_file)
            combined = pd.concat([existing, rankings_df], ignore_index=True)
        except FileNotFoundError:
            combined = rankings_df

        combined.to_csv(self.history_file, index=False)
        print(f"\nSaved {len(rankings_df)} rankings to {self.history_file}")

    def get_rank_changes(self, domain):
        """Detect ranking changes"""
        df = pd.read_csv(self.history_file)
        df['timestamp'] = pd.to_datetime(df['timestamp'])

        changes = []

        for keyword in df['keyword'].unique():
            keyword_data = df[
                (df['keyword'] == keyword) &
                (df['domain'] == domain)
            ].sort_values('timestamp')

            if len(keyword_data) >= 2:
                previous = keyword_data.iloc[-2]
                current = keyword_data.iloc[-1]

                prev_pos = previous['position']
                curr_pos = current['position']

                # Positions read back from CSV are NaN (not None) when the
                # domain was not ranking, so compare with pd.isna()
                if prev_pos != curr_pos and not (pd.isna(prev_pos) and pd.isna(curr_pos)):
                    if pd.isna(prev_pos):
                        change_str = f"Entered rankings at #{int(curr_pos)}"
                    elif pd.isna(curr_pos):
                        change_str = f"Dropped out from #{int(prev_pos)}"
                    else:
                        change = int(prev_pos) - int(curr_pos)
                        change_str = f"Moved from #{int(prev_pos)} to #{int(curr_pos)} ({change:+d})"

                    changes.append({
                        'keyword': keyword,
                        'previous': prev_pos,
                        'current': curr_pos,
                        'change': change_str
                    })

        return pd.DataFrame(changes)

# Usage
tracker = RankTracker("sk_live_xxxx_449x")

# Your domain and target keywords
my_domain = "mywebsite.com"
keywords = [
    "project management software",
    "team collaboration tools",
    "agile project management",
    "task tracking software"
]

# Check current rankings
rankings = tracker.check_rankings(my_domain, keywords)
tracker.save_rankings(rankings)

# Detect changes
changes = tracker.get_rank_changes(my_domain)
if not changes.empty:
    print("\n=== Ranking Changes ===")
    for _, change in changes.iterrows():
        print(f"\n{change['keyword']}")
        print(f"  {change['change']}")

On-Page SEO Audit

seo_audit.py
python
from scrapehub import ScrapeHubClient
import pandas as pd

class SEOAuditor:
    def __init__(self, api_key):
        self.client = ScrapeHubClient(api_key=api_key)

    def audit_page(self, url):
        """Perform comprehensive SEO audit of a page"""

        result = self.client.scrape(
            url=url,
            engine="neural-x1",
            render_js=True
        )

        if not result.data:
            return None

        page = result.data[0]

        # Extract SEO elements
        audit = {
            'url': url,

            # Meta tags
            'title': page.get('title'),
            'title_length': len(page.get('title') or ''),
            'meta_description': page.get('meta_description'),
            'meta_description_length': len(page.get('meta_description') or ''),
            'meta_keywords': page.get('meta_keywords'),

            # Headers
            'h1_tags': page.get('h1_tags', []),
            'h1_count': len(page.get('h1_tags', [])),
            'h2_count': len(page.get('h2_tags', [])),

            # Content
            'word_count': page.get('word_count', 0),
            'images_count': len(page.get('images', [])),
            'images_with_alt': len([img for img in page.get('images', []) if img.get('alt')]),

            # Links
            'internal_links': len(page.get('internal_links', [])),
            'external_links': len(page.get('external_links', [])),

            # Technical
            'has_canonical': bool(page.get('canonical_url')),
            'canonical_url': page.get('canonical_url'),
            'has_robots_meta': bool(page.get('robots_meta')),
            'robots_meta': page.get('robots_meta'),
            'has_schema': bool(page.get('schema_markup')),

            # Social
            'og_title': page.get('og_title'),
            'og_description': page.get('og_description'),
            'og_image': page.get('og_image'),
            'twitter_card': page.get('twitter_card')
        }

        return audit

    def generate_issues(self, audit):
        """Identify SEO issues"""
        issues = []

        # Title issues
        if not audit['title']:
            issues.append({'severity': 'high', 'issue': 'Missing title tag'})
        elif audit['title_length'] < 30:
            issues.append({'severity': 'medium', 'issue': f'Title too short ({audit["title_length"]} chars, recommended 30-60)'})
        elif audit['title_length'] > 60:
            issues.append({'severity': 'low', 'issue': f'Title too long ({audit["title_length"]} chars, may be truncated)'})

        # Meta description issues
        if not audit['meta_description']:
            issues.append({'severity': 'high', 'issue': 'Missing meta description'})
        elif audit['meta_description_length'] < 120:
            issues.append({'severity': 'medium', 'issue': f'Meta description too short ({audit["meta_description_length"]} chars)'})

        # Header issues
        if audit['h1_count'] == 0:
            issues.append({'severity': 'high', 'issue': 'Missing H1 tag'})
        elif audit['h1_count'] > 1:
            issues.append({'severity': 'medium', 'issue': f'Multiple H1 tags ({audit["h1_count"]})'})

        # Content issues
        if audit['word_count'] < 300:
            issues.append({'severity': 'medium', 'issue': f'Thin content ({audit["word_count"]} words)'})

        # Image issues
        if audit['images_count'] > 0:
            missing_alt = audit['images_count'] - audit['images_with_alt']
            if missing_alt > 0:
                issues.append({'severity': 'medium', 'issue': f'{missing_alt} images missing alt text'})

        # Technical issues
        if not audit['has_canonical']:
            issues.append({'severity': 'low', 'issue': 'Missing canonical tag'})

        # Social issues
        if not audit['og_title']:
            issues.append({'severity': 'low', 'issue': 'Missing Open Graph tags'})

        return pd.DataFrame(issues)

    def audit_website(self, urls):
        """Audit multiple pages"""
        all_audits = []
        all_issues = []

        for url in urls:
            print(f"\nAuditing: {url}")

            audit = self.audit_page(url)
            if audit:
                all_audits.append(audit)

                issues = self.generate_issues(audit)
                for _, issue in issues.iterrows():
                    issue_dict = issue.to_dict()
                    issue_dict['url'] = url
                    all_issues.append(issue_dict)

                print(f"  Found {len(issues)} issues")

        return pd.DataFrame(all_audits), pd.DataFrame(all_issues)

# Usage
auditor = SEOAuditor("sk_live_xxxx_449x")

# Pages to audit
pages = [
    "https://mywebsite.com/",
    "https://mywebsite.com/products",
    "https://mywebsite.com/about",
    "https://mywebsite.com/contact"
]

# Perform audit
audits_df, issues_df = auditor.audit_website(pages)

# Export reports
audits_df.to_excel('seo_audit_report.xlsx', index=False)
issues_df.to_excel('seo_issues.xlsx', index=False)

# Summary
print("\n=== Audit Summary ===")
print(f"Pages audited: {len(audits_df)}")
print(f"Total issues: {len(issues_df)}")
print(f"\nIssues by severity:")
print(issues_df['severity'].value_counts())

Competitor Content Analysis

competitor_content.py
python
from scrapehub import AsyncScrapeHubClient
import asyncio
import pandas as pd

async def analyze_competitor_content(competitor_urls):
    """Analyze content strategy of competitors"""
    client = AsyncScrapeHubClient(api_key="sk_live_xxxx_449x")

    # Scrape all competitors
    tasks = [client.scrape(url=url, engine="neural-x1", render_js=True) for url in competitor_urls]
    results = await asyncio.gather(*tasks)

    analysis = []

    for url, result in zip(competitor_urls, results):
        if result.data:
            page = result.data[0]

            analysis.append({
                'url': url,
                'domain': url.split('/')[2],
                'title': page.get('title'),
                'word_count': page.get('word_count', 0),
                'h1_tags': len(page.get('h1_tags', [])),
                'h2_tags': len(page.get('h2_tags', [])),
                'images': len(page.get('images', [])),
                'videos': len(page.get('videos', [])),
                'internal_links': len(page.get('internal_links', [])),
                'external_links': len(page.get('external_links', [])),
                'has_faq': bool(page.get('faq_schema')),
                'has_howto': bool(page.get('howto_schema')),
                'readability_score': page.get('readability_score')
            })

    return pd.DataFrame(analysis)

# Competitor URLs (top 5 ranking pages for a keyword)
competitors = [
    "https://competitor1.com/best-project-tools",
    "https://competitor2.com/project-management-guide",
    "https://competitor3.com/top-pm-software",
    "https://competitor4.com/project-tools-comparison",
    "https://competitor5.com/pm-software-review"
]

# Analyze
df = asyncio.run(analyze_competitor_content(competitors))

# Insights
print("=== Competitor Content Analysis ===")
print(f"\nAverage word count: {df['word_count'].mean():.0f}")
print(f"Average images per page: {df['images'].mean():.1f}")
print(f"Average internal links: {df['internal_links'].mean():.1f}")
print(f"Pages with FAQ schema: {df['has_faq'].sum()}")

print("\n=== Top Performers ===")
top_performer = df.loc[df['word_count'].idxmax()]
print(f"Most comprehensive: {top_performer['domain']}")
print(f"  Word count: {top_performer['word_count']}")
print(f"  Images: {top_performer['images']}")

# Export
df.to_excel('competitor_content_analysis.xlsx', index=False)

Keyword Gap Analysis

keyword_gap.py
python
from scrapehub import ScrapeHubClient
import pandas as pd

class KeywordGapAnalyzer:
    def __init__(self, api_key):
        self.client = ScrapeHubClient(api_key=api_key)

    def get_ranking_keywords(self, domain):
        """Get keywords that a domain ranks for"""
        # A real implementation would use an SEO tool's API for ranking data;
        # for demonstration, we scrape the sitemap and a few key pages instead

        sitemap_url = f"https://{domain}/sitemap.xml"

        result = self.client.scrape(
            url=sitemap_url,
            engine="neural-x1"
        )

        # Extract URLs from sitemap
        urls = [item['url'] for item in result.data if 'url' in item]

        # For each URL, extract target keywords from content
        keywords = []

        for url in urls[:10]:  # First 10 pages
            page_result = self.client.scrape(url=url, engine="neural-x1")

            if page_result.data:
                page = page_result.data[0]
                # Extract keywords from title, h1, meta
                page_keywords = self._extract_keywords(page)
                keywords.extend(page_keywords)

        return list(set(keywords))

    def _extract_keywords(self, page):
        """Extract potential keywords from page content.

        Note: this is a naive word-level split; real keyword research would
        work with phrases and search-volume data.
        """
        keywords = []

        # From title
        if page.get('title'):
            keywords.extend(page['title'].lower().split())

        # From H1 tags
        for h1 in page.get('h1_tags', []):
            keywords.extend(h1.lower().split())

        # From meta keywords
        if page.get('meta_keywords'):
            keywords.extend([k.strip() for k in page['meta_keywords'].split(',')])

        return keywords

    def find_gaps(self, my_domain, competitor_domains):
        """Find keyword gaps between you and competitors"""

        print("Analyzing keyword coverage...")

        # Get my keywords
        print(f"\nAnalyzing {my_domain}...")
        my_keywords = set(self.get_ranking_keywords(my_domain))
        print(f"  Found {len(my_keywords)} keywords")

        # Get competitor keywords
        competitor_keywords = {}
        for competitor in competitor_domains:
            print(f"\nAnalyzing {competitor}...")
            keywords = set(self.get_ranking_keywords(competitor))
            competitor_keywords[competitor] = keywords
            print(f"  Found {len(keywords)} keywords")

        # Find gaps (keywords competitors rank for but you don't)
        all_competitor_keywords = set()
        for keywords in competitor_keywords.values():
            all_competitor_keywords.update(keywords)

        gaps = all_competitor_keywords - my_keywords

        # Find opportunities (multiple competitors rank for same keyword)
        opportunities = []
        for keyword in gaps:
            ranking_competitors = [
                comp for comp, kw in competitor_keywords.items() if keyword in kw
            ]
            if len(ranking_competitors) >= 2:
                opportunities.append({
                    'keyword': keyword,
                    'competitors_ranking': len(ranking_competitors),
                    'competitors': ranking_competitors
                })

        # Guard against an empty result so sort_values doesn't fail
        if not opportunities:
            return pd.DataFrame(columns=['keyword', 'competitors_ranking', 'competitors'])

        return pd.DataFrame(opportunities).sort_values('competitors_ranking', ascending=False)

# Usage
analyzer = KeywordGapAnalyzer("sk_live_xxxx_449x")

my_domain = "mywebsite.com"
competitors = [
    "competitor1.com",
    "competitor2.com",
    "competitor3.com"
]

# Find gaps
gaps_df = analyzer.find_gaps(my_domain, competitors)

print("\n=== Keyword Opportunities ===")
print(f"Total gap keywords: {len(gaps_df)}")
print("\nTop opportunities (multiple competitors rank):")
print(gaps_df.head(20))

# Export
gaps_df.to_excel('keyword_gaps.xlsx', index=False)

Backlink Profile Analysis

backlink_analysis.py
python
from scrapehub import ScrapeHubClient
import pandas as pd

class BacklinkAnalyzer:
    def __init__(self, api_key):
        self.client = ScrapeHubClient(api_key=api_key)

    def find_competitor_backlinks(self, competitor_url):
        """Find pages linking to competitor"""

        # Note: Google's link: operator is deprecated and returns few or no
        # results; treat this as illustrative and prefer a dedicated backlink
        # data source (or the mention search sketch below) for real coverage
        search_query = f"link:{competitor_url}"
        search_url = f"https://www.google.com/search?q={search_query}"

        result = self.client.scrape(
            url=search_url,
            engine="neural-x1",
            render_js=True
        )

        backlinks = []

        for item in result.data:
            backlinks.append({
                'linking_domain': item.get('domain'),
                'linking_url': item.get('url'),
                'anchor_text': item.get('title'),
                'snippet': item.get('description')
            })

        # Pass explicit columns so an empty result still has the expected schema
        return pd.DataFrame(
            backlinks,
            columns=['linking_domain', 'linking_url', 'anchor_text', 'snippet']
        )

    def analyze_linking_domains(self, backlinks_df):
        """Analyze backlink profile"""

        analysis = {
            'total_backlinks': len(backlinks_df),
            'unique_domains': backlinks_df['linking_domain'].nunique(),
            'top_domains': backlinks_df['linking_domain'].value_counts().head(10)
        }

        return analysis

    def find_link_opportunities(self, my_domain, competitor_domains):
        """Find sites that link to competitors but not to you"""

        print("Finding link opportunities...")

        # Get my backlinks
        print(f"\nAnalyzing backlinks to {my_domain}...")
        my_backlinks = self.find_competitor_backlinks(my_domain)
        my_domains = set(my_backlinks['linking_domain'].unique())

        # Get competitor backlinks
        all_competitor_backlinks = []

        for competitor in competitor_domains:
            print(f"Analyzing backlinks to {competitor}...")
            backlinks = self.find_competitor_backlinks(competitor)
            backlinks['competitor'] = competitor
            all_competitor_backlinks.append(backlinks)

        competitor_backlinks_df = pd.concat(all_competitor_backlinks, ignore_index=True)

        # Find opportunities (domains linking to competitors but not to me)
        competitor_domains_set = set(competitor_backlinks_df['linking_domain'].unique())
        opportunities = competitor_domains_set - my_domains

        # Create opportunities dataframe
        opportunities_df = competitor_backlinks_df[
            competitor_backlinks_df['linking_domain'].isin(opportunities)
        ]

        return opportunities_df

# Usage
analyzer = BacklinkAnalyzer("sk_live_xxxx_449x")

my_domain = "mywebsite.com"
competitors = ["competitor1.com", "competitor2.com"]

# Find opportunities
opportunities = analyzer.find_link_opportunities(my_domain, competitors)

print("\n=== Link Building Opportunities ===")
print(f"Found {len(opportunities)} potential link opportunities")
print("\nTop domains to target:")
print(opportunities['linking_domain'].value_counts().head(20))

# Export for outreach
opportunities.to_excel('link_opportunities.xlsx', index=False)
print("\nExported to link_opportunities.xlsx")

Best Practices

  • Track rankings consistently at the same time daily/weekly (see the scheduling sketch after this list)
  • Monitor both organic and featured snippet positions
  • Use JavaScript rendering for SERP and dynamic content
  • Respect search engine rate limits and robots.txt
  • Combine automated data with manual analysis
  • Track local rankings from different geographic locations
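
A minimal scheduling sketch for the first point, assuming the third-party schedule package (pip install schedule) and the RankTracker class defined in rank_tracker.py above; a cron job or hosted scheduler works just as well.

schedule_rank_checks.py
python
import time
import schedule  # third-party scheduling helper

from rank_tracker import RankTracker  # the class defined earlier in this guide

tracker = RankTracker("sk_live_xxxx_449x")
my_domain = "mywebsite.com"
keywords = ["project management software", "team collaboration tools"]

def daily_rank_check():
    """Check rankings and append them to rank_history.csv."""
    rankings = tracker.check_rankings(my_domain, keywords)
    tracker.save_rankings(rankings)

# Run at the same time every day so position changes are comparable
schedule.every().day.at("09:00").do(daily_rank_check)

while True:
    schedule.run_pending()
    time.sleep(60)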

Key SEO Metrics to Track

Rankings & Visibility

  • Keyword positions (top 3, top 10, top 100; see the bucketing sketch after this list)
  • SERP features captured (featured snippets, PAA)
  • Domain authority and page authority
  • Organic traffic trends
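
A small sketch for bucketing tracked positions, assuming the rank_history.csv file written by RankTracker above (columns: timestamp, keyword, domain, position, on_first_page).

position_buckets.py
python
import pandas as pd

# Load the history written by RankTracker.save_rankings()
df = pd.read_csv('rank_history.csv', parse_dates=['timestamp'])

# Keep only the most recent check per keyword
latest = df.sort_values('timestamp').groupby('keyword').tail(1).copy()

def bucket(position):
    """Map a position to the reporting bucket it falls in."""
    if pd.isna(position):
        return 'not ranking'
    if position <= 3:
        return 'top 3'
    if position <= 10:
        return 'top 10'
    return 'top 100'

latest['bucket'] = latest['position'].apply(bucket)
print(latest['bucket'].value_counts())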

On-Page Elements

  • Title and meta description optimization
  • Header structure (H1-H6)
  • Content depth and quality
  • Internal linking structure

Technical SEO

  • Page load speed (see the response-time sketch after this list)
  • Mobile responsiveness
  • Structured data implementation
  • Canonical tags and redirects
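
A rough sketch for page load speed, using plain requests response time as a simple proxy; it captures the HTTP round trip only, not full render time, so treat it as a starting point rather than a true page-speed audit.

response_time_check.py
python
import requests

pages = [
    "https://mywebsite.com/",
    "https://mywebsite.com/products",
]

for url in pages:
    # elapsed covers the request/response cycle only, not JS or asset loading
    response = requests.get(url, timeout=30)
    print(f"{url}: {response.elapsed.total_seconds():.2f}s (status {response.status_code})")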

Competitive Analysis

  • Competitor keyword rankings
  • Content gap identification
  • Backlink profile comparison
  • SERP feature ownership (see the featured-snippet sketch after this list)
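
A short sketch for SERP feature ownership, assuming the serp_*.csv files exported by scrape_google_serp above (columns include keyword, position, domain, featured_snippet).

serp_feature_ownership.py
python
import glob
import pandas as pd

# Combine every serp_*.csv exported by scrape_google_serp()
# (assumes at least one export exists)
files = glob.glob('serp_*.csv')
serp = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)

# featured_snippet is the boolean flag captured during SERP scraping
featured = serp[serp['featured_snippet'] == True]

print("Featured snippet ownership by domain:")
print(featured.groupby('domain')['keyword'].nunique().sort_values(ascending=False))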