SEO Data Extraction Guide

Extract SERP data, keywords, and competitor SEO metrics

What You'll Learn

  • Scrape Google SERP results and rankings
  • Extract meta tags, headers, and on-page SEO elements
  • Monitor keyword positions and track ranking changes over time
  • Analyze competitor backlinks and content
  • Build automated SEO auditing tools

SERP Scraping (Search Results)

scrape_serp.py
python
from scrapehub import ScrapeHubClient
from urllib.parse import quote_plus
import pandas as pd

client = ScrapeHubClient(api_key="sk_live_xxxx_449x")

def scrape_google_serp(keyword, location='United States'):
    """Scrape Google search results for a keyword.

    Note: location is accepted but not applied here; geo-targeted results
    typically need extra query parameters or a geo-located proxy.
    """

    # Build the Google search URL (quote_plus encodes spaces and special characters)
    search_url = f"https://www.google.com/search?q={quote_plus(keyword)}"

    result = client.scrape(
        url=search_url,
        engine="neural-x1",
        render_js=True,
        wait_for_selector="#search"
    )

    # Extract SERP features
    serp_results = []

    for item in result.data:
        serp_results.append({
            'keyword': keyword,
            'position': item.get('position'),
            'title': item.get('title'),
            'url': item.get('url'),
            'domain': item.get('domain'),
            'snippet': item.get('description'),
            'featured_snippet': item.get('is_featured', False),
            'site_links': item.get('site_links', [])
        })

    return pd.DataFrame(serp_results)

# Scrape SERP for keyword
keyword = "best project management software"
df = scrape_google_serp(keyword)

print(f"\n=== SERP Results for '{keyword}' ===")
print(df[['position', 'title', 'domain']])

# Export
df.to_csv(f'serp_{keyword.replace(" ", "_")}.csv', index=False)
print(f"\nExported to serp_{keyword.replace(' ', '_')}.csv")

Rank Tracking System

rank_tracker.py
python
from scrapehub import ScrapeHubClient
from urllib.parse import quote_plus
import pandas as pd
from datetime import datetime

class RankTracker:
    def __init__(self, api_key):
        self.client = ScrapeHubClient(api_key=api_key)
        self.history_file = 'rank_history.csv'

    def check_rankings(self, domain, keywords):
        """Check rankings for domain across multiple keywords"""

        rankings = []

        for keyword in keywords:
            print(f"Checking '{keyword}'...")

            search_url = f"https://www.google.com/search?q={quote_plus(keyword)}"

            result = self.client.scrape(
                url=search_url,
                engine="neural-x1",
                render_js=True
            )

            # Find domain position
            position = None
            for idx, item in enumerate(result.data, 1):
                if domain in item.get('url', ''):
                    position = idx
                    break

            rankings.append({
                'timestamp': datetime.now(),
                'keyword': keyword,
                'domain': domain,
                'position': position,
                'on_first_page': position is not None and position <= 10
            })

            if position:
                print(f"  Position: #{position}")
            else:
                print(f"  Not in top 100")

        return pd.DataFrame(rankings)

    def save_rankings(self, rankings_df):
        """Save rankings to history"""
        try:
            existing = pd.read_csv(self.history_file)
            combined = pd.concat([existing, rankings_df], ignore_index=True)
        except FileNotFoundError:
            combined = rankings_df

        combined.to_csv(self.history_file, index=False)
        print(f"\nSaved {len(rankings_df)} rankings to {self.history_file}")

    def get_rank_changes(self, domain):
        """Detect ranking changes"""
        df = pd.read_csv(self.history_file)
        df['timestamp'] = pd.to_datetime(df['timestamp'])

        changes = []

        for keyword in df['keyword'].unique():
            keyword_data = df[
                (df['keyword'] == keyword) &
                (df['domain'] == domain)
            ].sort_values('timestamp')

            if len(keyword_data) >= 2:
                previous = keyword_data.iloc[-2]
                current = keyword_data.iloc[-1]

                prev_pos = previous['position']
                curr_pos = current['position']

                # Positions read back from CSV are NaN (not None) when the
                # domain was not ranking, so compare with pd.isna()
                if prev_pos != curr_pos and not (pd.isna(prev_pos) and pd.isna(curr_pos)):
                    if pd.isna(prev_pos):
                        change_str = f"Entered rankings at #{int(curr_pos)}"
                    elif pd.isna(curr_pos):
                        change_str = f"Dropped out from #{int(prev_pos)}"
                    else:
                        change = int(prev_pos) - int(curr_pos)
                        change_str = f"Moved from #{int(prev_pos)} to #{int(curr_pos)} ({change:+d})"

                    changes.append({
                        'keyword': keyword,
                        'previous': prev_pos,
                        'current': curr_pos,
                        'change': change_str
                    })

        return pd.DataFrame(changes)

# Usage
tracker = RankTracker("sk_live_xxxx_449x")

# Your domain and target keywords
my_domain = "mywebsite.com"
keywords = [
    "project management software",
    "team collaboration tools",
    "agile project management",
    "task tracking software"
]

# Check current rankings
rankings = tracker.check_rankings(my_domain, keywords)
tracker.save_rankings(rankings)

# Detect changes
changes = tracker.get_rank_changes(my_domain)
if not changes.empty:
    print("\n=== Ranking Changes ===")
    for _, change in changes.iterrows():
        print(f"\n{change['keyword']}")
        print(f"  {change['change']}")

On-Page SEO Audit

seo_audit.py
python
from scrapehub import ScrapeHubClient
import pandas as pd

class SEOAuditor:
    def __init__(self, api_key):
        self.client = ScrapeHubClient(api_key=api_key)

    def audit_page(self, url):
        """Perform comprehensive SEO audit of a page"""

        result = self.client.scrape(
            url=url,
            engine="neural-x1",
            render_js=True
        )

        if not result.data:
            return None

        page = result.data[0]

        # Extract SEO elements
        audit = {
            'url': url,

            # Meta tags
            'title': page.get('title'),
            'title_length': len(page.get('title') or ''),
            'meta_description': page.get('meta_description'),
            'meta_description_length': len(page.get('meta_description') or ''),
            'meta_keywords': page.get('meta_keywords'),

            # Headers
            'h1_tags': page.get('h1_tags', []),
            'h1_count': len(page.get('h1_tags', [])),
            'h2_count': len(page.get('h2_tags', [])),

            # Content
            'word_count': page.get('word_count', 0),
            'images_count': len(page.get('images', [])),
            'images_with_alt': len([img for img in page.get('images', []) if img.get('alt')]),

            # Links
            'internal_links': len(page.get('internal_links', [])),
            'external_links': len(page.get('external_links', [])),

            # Technical
            'has_canonical': bool(page.get('canonical_url')),
            'canonical_url': page.get('canonical_url'),
            'has_robots_meta': bool(page.get('robots_meta')),
            'robots_meta': page.get('robots_meta'),
            'has_schema': bool(page.get('schema_markup')),

            # Social
            'og_title': page.get('og_title'),
            'og_description': page.get('og_description'),
            'og_image': page.get('og_image'),
            'twitter_card': page.get('twitter_card')
        }

        return audit

    def generate_issues(self, audit):
        """Identify SEO issues"""
        issues = []

        # Title issues
        if not audit['title']:
            issues.append({'severity': 'high', 'issue': 'Missing title tag'})
        elif audit['title_length'] < 30:
            issues.append({'severity': 'medium', 'issue': f'Title too short ({audit["title_length"]} chars, recommended 30-60)'})
        elif audit['title_length'] > 60:
            issues.append({'severity': 'low', 'issue': f'Title too long ({audit["title_length"]} chars, may be truncated)'})

        # Meta description issues
        if not audit['meta_description']:
            issues.append({'severity': 'high', 'issue': 'Missing meta description'})
        elif audit['meta_description_length'] < 120:
            issues.append({'severity': 'medium', 'issue': f'Meta description too short ({audit["meta_description_length"]} chars)'})

        # Header issues
        if audit['h1_count'] == 0:
            issues.append({'severity': 'high', 'issue': 'Missing H1 tag'})
        elif audit['h1_count'] > 1:
            issues.append({'severity': 'medium', 'issue': f'Multiple H1 tags ({audit["h1_count"]})'})

        # Content issues
        if audit['word_count'] < 300:
            issues.append({'severity': 'medium', 'issue': f'Thin content ({audit["word_count"]} words)'})

        # Image issues
        if audit['images_count'] > 0:
            missing_alt = audit['images_count'] - audit['images_with_alt']
            if missing_alt > 0:
                issues.append({'severity': 'medium', 'issue': f'{missing_alt} images missing alt text'})

        # Technical issues
        if not audit['has_canonical']:
            issues.append({'severity': 'low', 'issue': 'Missing canonical tag'})

        # Social issues
        if not audit['og_title']:
            issues.append({'severity': 'low', 'issue': 'Missing Open Graph tags'})

        return pd.DataFrame(issues)

    def audit_website(self, urls):
        """Audit multiple pages"""
        all_audits = []
        all_issues = []

        for url in urls:
            print(f"\nAuditing: {url}")

            audit = self.audit_page(url)
            if audit:
                all_audits.append(audit)

                issues = self.generate_issues(audit)
                for _, issue in issues.iterrows():
                    issue_dict = issue.to_dict()
                    issue_dict['url'] = url
                    all_issues.append(issue_dict)

                print(f"  Found {len(issues)} issues")

        return pd.DataFrame(all_audits), pd.DataFrame(all_issues)

# Usage
auditor = SEOAuditor("sk_live_xxxx_449x")

# Pages to audit
pages = [
    "https://mywebsite.com/",
    "https://mywebsite.com/products",
    "https://mywebsite.com/about",
    "https://mywebsite.com/contact"
]

# Perform audit
audits_df, issues_df = auditor.audit_website(pages)

# Export reports
audits_df.to_excel('seo_audit_report.xlsx', index=False)
issues_df.to_excel('seo_issues.xlsx', index=False)

# Summary
print("\n=== Audit Summary ===")
print(f"Pages audited: {len(audits_df)}")
print(f"Total issues: {len(issues_df)}")
print(f"\nIssues by severity:")
print(issues_df['severity'].value_counts())

Competitor Content Analysis

competitor_content.py
python
from scrapehub import AsyncScrapeHubClient
import asyncio
import pandas as pd

async def analyze_competitor_content(competitor_urls):
    """Analyze content strategy of competitors"""
    client = AsyncScrapeHubClient(api_key="sk_live_xxxx_449x")

    # Scrape all competitors
    tasks = [client.scrape(url=url, engine="neural-x1", render_js=True) for url in competitor_urls]
    results = await asyncio.gather(*tasks)

    analysis = []

    for url, result in zip(competitor_urls, results):
        if result.data:
            page = result.data[0]

            analysis.append({
                'url': url,
                'domain': url.split('/')[2],
                'title': page.get('title'),
                'word_count': page.get('word_count', 0),
                'h1_tags': len(page.get('h1_tags', [])),
                'h2_tags': len(page.get('h2_tags', [])),
                'images': len(page.get('images', [])),
                'videos': len(page.get('videos', [])),
                'internal_links': len(page.get('internal_links', [])),
                'external_links': len(page.get('external_links', [])),
                'has_faq': bool(page.get('faq_schema')),
                'has_howto': bool(page.get('howto_schema')),
                'readability_score': page.get('readability_score')
            })

    return pd.DataFrame(analysis)

# Competitor URLs (top 5 ranking pages for a keyword)
competitors = [
    "https://competitor1.com/best-project-tools",
    "https://competitor2.com/project-management-guide",
    "https://competitor3.com/top-pm-software",
    "https://competitor4.com/project-tools-comparison",
    "https://competitor5.com/pm-software-review"
]

# Analyze
df = asyncio.run(analyze_competitor_content(competitors))

# Insights
print("=== Competitor Content Analysis ===")
print(f"\nAverage word count: {df['word_count'].mean():.0f}")
print(f"Average images per page: {df['images'].mean():.1f}")
print(f"Average internal links: {df['internal_links'].mean():.1f}")
print(f"Pages with FAQ schema: {df['has_faq'].sum()}")

print("\n=== Top Performers ===")
top_performer = df.loc[df['word_count'].idxmax()]
print(f"Most comprehensive: {top_performer['domain']}")
print(f"  Word count: {top_performer['word_count']}")
print(f"  Images: {top_performer['images']}")

# Export
df.to_excel('competitor_content_analysis.xlsx', index=False)

Keyword Gap Analysis

keyword_gap.py
python
from scrapehub import ScrapeHubClient
import pandas as pd

class KeywordGapAnalyzer:
    def __init__(self, api_key):
        self.client = ScrapeHubClient(api_key=api_key)

    def get_ranking_keywords(self, domain):
        """Get keywords that a domain ranks for"""
        # A real implementation would use an SEO tool's API for ranking data;
        # for demonstration, we scrape the sitemap and a few key pages instead

        sitemap_url = f"https://{domain}/sitemap.xml"

        result = self.client.scrape(
            url=sitemap_url,
            engine="neural-x1"
        )

        # Extract URLs from sitemap
        urls = [item['url'] for item in result.data if 'url' in item]

        # For each URL, extract target keywords from content
        keywords = []

        for url in urls[:10]:  # First 10 pages
            page_result = self.client.scrape(url=url, engine="neural-x1")

            if page_result.data:
                page = page_result.data[0]
                # Extract keywords from title, h1, meta
                page_keywords = self._extract_keywords(page)
                keywords.extend(page_keywords)

        return list(set(keywords))

    def _extract_keywords(self, page):
        """Extract potential keywords from page content.

        Note: this is a naive word-level split; real keyword research would
        work with phrases and search-volume data.
        """
        keywords = []

        # From title
        if page.get('title'):
            keywords.extend(page['title'].lower().split())

        # From H1 tags
        for h1 in page.get('h1_tags', []):
            keywords.extend(h1.lower().split())

        # From meta keywords
        if page.get('meta_keywords'):
            keywords.extend([k.strip() for k in page['meta_keywords'].split(',')])

        return keywords

    def find_gaps(self, my_domain, competitor_domains):
        """Find keyword gaps between you and competitors"""

        print("Analyzing keyword coverage...")

        # Get my keywords
        print(f"\nAnalyzing {my_domain}...")
        my_keywords = set(self.get_ranking_keywords(my_domain))
        print(f"  Found {len(my_keywords)} keywords")

        # Get competitor keywords
        competitor_keywords = {}
        for competitor in competitor_domains:
            print(f"\nAnalyzing {competitor}...")
            keywords = set(self.get_ranking_keywords(competitor))
            competitor_keywords[competitor] = keywords
            print(f"  Found {len(keywords)} keywords")

        # Find gaps (keywords competitors rank for but you don't)
        all_competitor_keywords = set()
        for keywords in competitor_keywords.values():
            all_competitor_keywords.update(keywords)

        gaps = all_competitor_keywords - my_keywords

        # Find opportunities (multiple competitors rank for same keyword)
        opportunities = []
        for keyword in gaps:
            ranking_competitors = [
                comp for comp, kw in competitor_keywords.items() if keyword in kw
            ]
            if len(ranking_competitors) >= 2:
                opportunities.append({
                    'keyword': keyword,
                    'competitors_ranking': len(ranking_competitors),
                    'competitors': ranking_competitors
                })

        # Guard against an empty result so sort_values doesn't fail
        if not opportunities:
            return pd.DataFrame(columns=['keyword', 'competitors_ranking', 'competitors'])

        return pd.DataFrame(opportunities).sort_values('competitors_ranking', ascending=False)

# Usage
analyzer = KeywordGapAnalyzer("sk_live_xxxx_449x")

my_domain = "mywebsite.com"
competitors = [
    "competitor1.com",
    "competitor2.com",
    "competitor3.com"
]

# Find gaps
gaps_df = analyzer.find_gaps(my_domain, competitors)

print("\n=== Keyword Opportunities ===")
print(f"Total gap keywords: {len(gaps_df)}")
print("\nTop opportunities (multiple competitors rank):")
print(gaps_df.head(20))

# Export
gaps_df.to_excel('keyword_gaps.xlsx', index=False)

Backlink Profile Analysis

backlink_analysis.py
python
from scrapehub import ScrapeHubClient
import pandas as pd

class BacklinkAnalyzer:
    def __init__(self, api_key):
        self.client = ScrapeHubClient(api_key=api_key)

    def find_competitor_backlinks(self, competitor_url):
        """Find pages linking to competitor"""

        # Note: Google's link: operator is deprecated and returns few or no
        # results; treat this as illustrative and prefer a dedicated backlink
        # data source (or the mention search sketch below) for real coverage
        search_query = f"link:{competitor_url}"
        search_url = f"https://www.google.com/search?q={search_query}"

        result = self.client.scrape(
            url=search_url,
            engine="neural-x1",
            render_js=True
        )

        backlinks = []

        for item in result.data:
            backlinks.append({
                'linking_domain': item.get('domain'),
                'linking_url': item.get('url'),
                'anchor_text': item.get('title'),
                'snippet': item.get('description')
            })

        # Pass explicit columns so an empty result still has the expected schema
        return pd.DataFrame(
            backlinks,
            columns=['linking_domain', 'linking_url', 'anchor_text', 'snippet']
        )

    def analyze_linking_domains(self, backlinks_df):
        """Analyze backlink profile"""

        analysis = {
            'total_backlinks': len(backlinks_df),
            'unique_domains': backlinks_df['linking_domain'].nunique(),
            'top_domains': backlinks_df['linking_domain'].value_counts().head(10)
        }

        return analysis

    def find_link_opportunities(self, my_domain, competitor_domains):
        """Find sites that link to competitors but not to you"""

        print("Finding link opportunities...")

        # Get my backlinks
        print(f"\nAnalyzing backlinks to {my_domain}...")
        my_backlinks = self.find_competitor_backlinks(my_domain)
        my_domains = set(my_backlinks['linking_domain'].unique())

        # Get competitor backlinks
        all_competitor_backlinks = []

        for competitor in competitor_domains:
            print(f"Analyzing backlinks to {competitor}...")
            backlinks = self.find_competitor_backlinks(competitor)
            backlinks['competitor'] = competitor
            all_competitor_backlinks.append(backlinks)

        competitor_backlinks_df = pd.concat(all_competitor_backlinks, ignore_index=True)

        # Find opportunities (domains linking to competitors but not to me)
        competitor_domains_set = set(competitor_backlinks_df['linking_domain'].unique())
        opportunities = competitor_domains_set - my_domains

        # Create opportunities dataframe
        opportunities_df = competitor_backlinks_df[
            competitor_backlinks_df['linking_domain'].isin(opportunities)
        ]

        return opportunities_df

# Usage
analyzer = BacklinkAnalyzer("sk_live_xxxx_449x")

my_domain = "mywebsite.com"
competitors = ["competitor1.com", "competitor2.com"]

# Find opportunities
opportunities = analyzer.find_link_opportunities(my_domain, competitors)

print("\n=== Link Building Opportunities ===")
print(f"Found {len(opportunities)} potential link opportunities")
print("\nTop domains to target:")
print(opportunities['linking_domain'].value_counts().head(20))

# Export for outreach
opportunities.to_excel('link_opportunities.xlsx', index=False)
print("\nExported to link_opportunities.xlsx")

Best Practices

  • Track rankings consistently at the same time daily/weekly (see the scheduling sketch after this list)
  • Monitor both organic and featured snippet positions
  • Use JavaScript rendering for SERP and dynamic content
  • Respect search engine rate limits and robots.txt
  • Combine automated data with manual analysis
  • Track local rankings from different geographic locations
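
A minimal scheduling sketch for the first point, assuming the third-party schedule package (pip install schedule) and the RankTracker class defined in rank_tracker.py above; a cron job or hosted scheduler works just as well.

schedule_rank_checks.py
python
import time
import schedule  # third-party scheduling helper

from rank_tracker import RankTracker  # the class defined earlier in this guide

tracker = RankTracker("sk_live_xxxx_449x")
my_domain = "mywebsite.com"
keywords = ["project management software", "team collaboration tools"]

def daily_rank_check():
    """Check rankings and append them to rank_history.csv."""
    rankings = tracker.check_rankings(my_domain, keywords)
    tracker.save_rankings(rankings)

# Run at the same time every day so position changes are comparable
schedule.every().day.at("09:00").do(daily_rank_check)

while True:
    schedule.run_pending()
    time.sleep(60)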

Key SEO Metrics to Track

Rankings & Visibility

  • Keyword positions (top 3, top 10, top 100; see the bucketing sketch after this list)
  • SERP features captured (featured snippets, PAA)
  • Domain authority and page authority
  • Organic traffic trends
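
A small sketch for bucketing tracked positions, assuming the rank_history.csv file written by RankTracker above (columns: timestamp, keyword, domain, position, on_first_page).

position_buckets.py
python
import pandas as pd

# Load the history written by RankTracker.save_rankings()
df = pd.read_csv('rank_history.csv', parse_dates=['timestamp'])

# Keep only the most recent check per keyword
latest = df.sort_values('timestamp').groupby('keyword').tail(1).copy()

def bucket(position):
    """Map a position to the reporting bucket it falls in."""
    if pd.isna(position):
        return 'not ranking'
    if position <= 3:
        return 'top 3'
    if position <= 10:
        return 'top 10'
    return 'top 100'

latest['bucket'] = latest['position'].apply(bucket)
print(latest['bucket'].value_counts())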

On-Page Elements

  • Title and meta description optimization
  • Header structure (H1-H6)
  • Content depth and quality
  • Internal linking structure

Technical SEO

  • Page load speed (see the response-time sketch after this list)
  • Mobile responsiveness
  • Structured data implementation
  • Canonical tags and redirects
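
A rough sketch for page load speed, using plain requests response time as a simple proxy; it captures the HTTP round trip only, not full render time, so treat it as a starting point rather than a true page-speed audit.

response_time_check.py
python
import requests

pages = [
    "https://mywebsite.com/",
    "https://mywebsite.com/products",
]

for url in pages:
    # elapsed covers the request/response cycle only, not JS or asset loading
    response = requests.get(url, timeout=30)
    print(f"{url}: {response.elapsed.total_seconds():.2f}s (status {response.status_code})")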

Competitive Analysis

  • Competitor keyword rankings
  • Content gap identification
  • Backlink profile comparison
  • SERP feature ownership (see the featured-snippet sketch after this list)
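
A short sketch for SERP feature ownership, assuming the serp_*.csv files exported by scrape_google_serp above (columns include keyword, position, domain, featured_snippet).

serp_feature_ownership.py
python
import glob
import pandas as pd

# Combine every serp_*.csv exported by scrape_google_serp()
# (assumes at least one export exists)
files = glob.glob('serp_*.csv')
serp = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)

# featured_snippet is the boolean flag captured during SERP scraping
featured = serp[serp['featured_snippet'] == True]

print("Featured snippet ownership by domain:")
print(featured.groupby('domain')['keyword'].nunique().sort_values(ascending=False))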