Scrape website

Playwright scraper

Besides Playwright, this script uses a webserver built with Express. Install it with 'npm i express' in the terminal (the script also requires the 'playwright-chromium' package, so install that too if it is not already present).
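crawl.js builds on a minimal Express pattern: one GET route and a server listening on port 1234. A stripped-down sketch of that skeleton (the full script follows further down):

// Minimal Express skeleton that crawl.js builds on (stripped-down sketch).
const express = require('express');
const app = express();
const port = 1234;

app.get('/crawl', async (req, res) => {
    // the Playwright scraping logic from the full script goes here
    res.send('crawl finished');
});

app.listen(port, () => {
    console.log(`app running on Port: ${port}`);
});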

Steps:

  • Create the script (crawl.js) with Express and Playwright
  • Open a terminal and start the webserver with 'node crawl'
  • Go to localhost:1234 in a browser
  • Add the website as a parameter to the url: 'localhost:1234/crawl?website=https://domaintoscan.com'
  • Scope the scraping by filtering on url, or scrape all pages from the defined domain
  • Kick off the scraping action by hitting enter after adding the url parameter
  • If headless: false is set in the script, you should see a browser window open
  • The script will start scraping the defined elements (seo-title, meta-data, headings, images, links)
  • Results of the scrape are logged in the terminal window, so you can follow what has been found
  • Playwright will add the links it finds to the queue
  • Every url in the queue will be scraped
  • Urls which do not start with the domain name used in the parameter are excluded from scraping
  • Urls ending with '.pdf' are excluded
  • Urls containing '#', '@', '?' or 'tel:' are excluded (a small sketch of this filter follows after this list)
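The exclusion rules above correspond to the checks inside the script's hrefs.filter(...) call. A minimal sketch of the same rules as a standalone function (the name shouldCrawl is illustrative and not part of the script):

// Sketch of the url filter described in the steps above. The name
// shouldCrawl is illustrative; crawl.js applies the same checks inline
// inside hrefs.filter(...).
function shouldCrawl(href, website, registry) {
    return href.startsWith(website) &&      // stay on the defined domain
        registry[href] === undefined &&     // skip urls that were already scraped
        !href.endsWith('.pdf') &&           // skip pdf documents
        !href.includes('#') &&              // skip anchor links
        !href.includes('?') &&              // skip query strings
        !href.includes('@') &&              // skip mailto-style urls
        !href.includes('tel:');             // skip phone links
}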

Create crawl.js. This script starts the webserver and scrapes data from the urls.

// Go to localhost:1234/crawl?website=
// Add the url you want to scrape as parameter
// http://localhost:1234/crawl?website=https://www.essent.nl 
// http://localhost:1234/crawl?website=http://books.toscrape.com/


const playwright = require('playwright-chromium');
const express = require('express');
const app = express();
const port = 1234;

app.get("/crawl", async(req, res, next) => {
    const website = req.query.website
    if (!website) {
        const err = new Error("required parameter missing");
        err.status = 400
        return next(err);
    }

    try {

        var startTime = new Date();
        console.log('Start time', startTime)

        const browser = await playwright.chromium.launch({ headless: true });
        // const browser = await playwright.chromium.launch({ headless: false, slowMo: 250 });
        const context = await browser.newContext({
            // viewport: { width: 1240, height: 800 },
            // deviceScaleFactor: 1,
            // recordVideo: { dir: 'videos/' }
        });
        // await context.addCookies([...cookiesArr])

        const registry = {};
        let queue = [website]
        let addCount = 0

        while (queue.length > 0) {
            const url = queue[queue.length - 1];
            console.log("Current url:", url)
            const page = await context.newPage();
            // const page = await browser.newPage();
            await page.goto(url);

            registry[url] = []

            try {
                addCount += 1
                console.log('COUNTER', addCount)
                registry[url].push({
                    'url': url,
                    'id': addCount
                });
            } catch {
                console.log('Could not register url', url)
            }

            // registry[url] = await page.content()   // Does work, gets whole content of the page.

            try {

                const htmlTitle = (await (await page.$('title')).textContent()).trim();
                console.log('Htmltitle:', htmlTitle)
                if (htmlTitle.length > 0) {
                    registry[url].push({
                        'metaElement': 'html-seo-title',
                        'pageTitle': htmlTitle
                    })
                } else {
                    registry[url].push({
                        'metaElement': 'html-seo-title',
                        'pageTitle': 'Empty page title'
                    })
                }
            } catch (error) {
                console.log('html-title not found')
                registry[url].push({
                    'metaElement': 'html-seo-title',
                    'pageTitle': 'No htmlTitle'
                })
            }

            // Check Canonical url
            try {
                const canon = await page.$$("link[rel='canonical']")

                for (let i = 0; i < canon.length; i++) {

                    const canonUrl = await canon[i].getAttribute('href')
                    if (canonUrl) {
                        console.log('canonical url:', canonUrl)
                        registry[url].push({
                            'metaElement': 'canonical-url',
                            'id': i,
                            'canonicalUrl': canonUrl
                        })
                    } else {
                        console.log('canonical empty / not found')
                        registry[url].push({
                            'metaElement': 'canonical-url',
                            'id': i,
                            'canonicalUrl': 'there is no canonical url in the html - no canonurl'
                        })
                    }

                }

            } catch {
                console.log('canonical not found')
                registry[url].push({
                    'metaElement': 'canonical-url',
                    'canonicalUrl': 'there is no canonical url in the html'
                })
            }
            // End Canonical url
            ////////// start meta-data
            try {

                const metas = await page.$$('meta')
                // console.log('Metas:', metas)
                for (let i = 0; i < metas.length; i++) {

                    const metaType = await metas[i].getAttribute('name');
                    const metaContent = await metas[i].getAttribute('content');
                    const metaProp = await metas[i].getAttribute('property');

                    // console.log('metas:', metaType + ' ' + metaContent + ' ' + metaProp)

                    registry[url].push({
                        'metaElement': metaType,
                        'id': i,
                        'metaElementContent': metaContent,
                        'metaElementProperty': metaProp
                    })
                }
            } catch {
                registry[url].push({
                    'metaElement': 'No metaType',
                    'id': 'i',
                    'metaElementContent': 'NometaContent',
                    'metaElementProperty': 'NometaProp'
                })
            }
            // End Meta
            // start Headings
            try {
                // registry[url] = await page.locator("H3").textContent(); // Works

                const headIngs = await page.$$("h1, h2, h3, h4, h5, h6");
                // headingDetails = []
                // console.log('Headings:', headIngs)

                for (let i = 0; i < headIngs.length; i++) {

                    const elementType = await headIngs[i].evaluate(e => e.tagName);
                    const typeElement = elementType.toLowerCase();
                    const headingTxt = await headIngs[i].textContent();
                    const headingTxtR = headingTxt.replace(/\s/g, ' ').trim()

                    // console.log('Heading:', headingTxtR)

                    registry[url].push({
                        'metaElement': 'heading',
                        'id': i,
                        'type': typeElement,
                        'headingTxt': headingTxtR,
                    });
                }
            } catch {

                // Keep the data collected so far; only record that no headings were found.
                registry[url].push({
                    'metaElement': 'heading',
                    'headingTxt': 'No data'
                })
            }
            // End Heading scrape
            // Start scraping images
            try {

                const images = await page.$$('img')
                // console.log('Image:', images)

                for (let i = 0; i < images.length; i++) {
                    let imageItemSrc = await images[i].getAttribute('src')
                    let imageItemAlt = await images[i].getAttribute('alt')
                    let imageItemLazy = await images[i].getAttribute('loading') // loading="lazy" marks lazy loaded images

                    // let imageItemSrcTextContent = await images[i].getAttribute('src').textContent();

                    // console.log('Img:', imageItemSrcTextContent)

                    registry[url].push({
                        'metaElement': 'image',
                        'id': i,
                        'imageSource': imageItemSrc,
                        'imageAlt': imageItemAlt || 'No alt',
                        'imageLazyLoaded': imageItemLazy || 'Not lazy loaded'
                    })
                }

            } catch {
                console.log('No images on the page')
            }
            // End image scrape
            ///Start scraping link elements
            try {

                const urlHrefs = await page.$$('a, button');
                // console.log('urlHrefs:', urlHrefs)
                // linkDetails = []

                for (let i = 0; i < urlHrefs.length; i++) {
                    const elementType = await urlHrefs[i].evaluate(e => e.tagName);
                    const typeElement = elementType.toLowerCase();
                    const type = 'link'
                    const href = await urlHrefs[i].getAttribute('href');
                    const hreftarget = await urlHrefs[i].getAttribute('target');
                    const hrefrel = await urlHrefs[i].getAttribute('rel');

                    const href2 = href || '/#'
                    const linkTxt = await urlHrefs[i].textContent();
                    const linkTxtR = linkTxt.replace(/\s/g, ' ').trim()

                    // console.log('LinkTxt:', linkTxt)

                    registry[url].push({
                        'metaElement': type,
                        'type': typeElement,
                        'id': i,
                        'linkTxt': linkTxtR,
                        'linkUrl': href2,
                        'hrefTarget': hreftarget,
                        'hrefRel': hrefrel
                    });
                }

            } catch {
                console.log('No links on the page')
            }
            // End scrape links

            queue.pop();
            console.log("queue lenght", queue.length)

            const hrefs = await page.$$eval('a', (anchors) => anchors.map((link) => link.href));
            console.log('HREFS ', hrefs)
            // console.log(typeof hrefs) // = object

            //// Specify a filter. Use startsWith(website) to scrape all urls found on the domain.
            //// Use startsWith('https://domain.com/url/filter/folder') to filter on urls with a specific path.
            //// Scope the scraping to the whole domain or to specific folders:
            const filteredHrefs = hrefs.filter(
                (href) => href.startsWith(website) && registry[href] === undefined && !href.endsWith('.pdf') && !href.includes('#') && !href.includes('?') && !href.includes('@') && !href.includes('tel:') && !href.includes('.ashx'))
            // (href) => href.startsWith('http://books.toscrape.com/catalogue/category/books/') && registry[href] === undefined && !href.endsWith('.pdf') && !href.includes('#') && !href.includes('?') && !href.includes('@') && !href.includes('tel:') && !href.includes('.ashx'))

            const uniqueHrefs = [...new Set(filteredHrefs)]

            queue.push(...uniqueHrefs)
            queue = [...new Set(queue)];

            // Close the page before moving on to the next url in the queue.
            await page.close()
        }
        const end = new Date() - startTime
        // console.info('Execution time: %dms', end)
        const secondsDuration = end / 1000

        console.info(` Scraped: ${addCount} pages in ${secondsDuration} seconds....`)

        await browser.close()

        return res.status(200).send(registry)

    } catch (e) {
        console.log(e);
        res.status(500).send("Something broke")
    }

})

app.listen(port, () => {
    console.log(`app running on Port: ${port}`)
})

SEO audit with Playwright

The script evaluates the following:

  • page title
  • meta data
  • headings H1 to H6
  • images
  • links

Use the results to check whether relevant technical SEO data is available: for images, whether the alt tags are solid and lazy loading is set; for links, whether a target and a nofollow tag are present.
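As an example of how the results can be checked, here is a minimal sketch of a separate Node script that scans the scraped json for images without alt text, images that are not lazy loaded, and links without a rel attribute. It assumes the registry structure shown in the partial json further down and the file path from the steps below; the script name and the checks are illustrative, not part of the crawler:

// seo-check.js (illustrative): scan the scraped json for common SEO gaps.
// Assumes the registry structure produced by crawl.js and that the output
// was saved as /html/toscrape.json (see the steps below).
const fs = require('fs');

const registry = JSON.parse(fs.readFileSync('./html/toscrape.json', 'utf8'));

for (const [url, items] of Object.entries(registry)) {
    const missingAlt = items.filter(i => i.metaElement === 'image' && i.imageAlt === 'No alt');
    const notLazy = items.filter(i => i.metaElement === 'image' && i.imageLazyLoaded === 'Not lazy loaded');
    const noRel = items.filter(i => i.metaElement === 'link' && i.type === 'a' && !i.hrefRel);

    console.log(url);
    console.log('  images without alt:', missingAlt.length);
    console.log('  images not lazy loaded:', notLazy.length);
    console.log('  links without rel attribute:', noRel.length);
}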

Steps:
Create a folder in the root directory: '/html'.
Create an index.html in the /html directory and add the following Html:

<!DOCTYPE html>
<html lang="en">

<head>
    <meta charset="UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Result of website crawl</title>
    <script src="https://cdn.tailwindcss.com"></script>
</head>

<body>
    <ul id="all"></ul>
    <script>
        async function loadNames() {
            const response = await fetch('toscrape.json');
            const Jsondata = await response.json();

            const arr = Object.keys(Jsondata).map((key) => [key, Jsondata[key]]);
            console.log('ARR', arr)
            for (let i = 0; i < arr.length; i++) {

                const list = document.getElementById('all')

                var resultUrl = arr[i][1].filter(obj => {
                    return obj.url

                })
                console.log('resultUrl i', resultUrl)
                for (let b = 0; b < resultUrl.length; b++) {
                    const urlItem = document.createElement('div')

                    const urlHref = JSON.stringify(resultUrl[b])
                    urlItem.classList = 'bg-blue-800 text-white px-2 py-4'

                    urlItem.innerText = urlHref

                    list.appendChild(urlItem)

                    const metaList = document.createElement('ul')
                    urlItem.append(metaList)

                    var resultMeta = arr[i][1].filter(obj => {
                        return obj.metaElement
                    })
                    console.log('RESULT', resultMeta)
                    for (let a = 0; a < resultMeta.length; a++) {
                        const listItem = document.createElement('li')
                        listItem.classList = 'odd:bg-white even:bg-slate-200 py-1 my-1 px-4 text-black'
                        listItem.innerHTML = (JSON.stringify(resultMeta[a]))
                        metaList.appendChild(listItem)
                    }
                } // End For ResultUrl 
            }
        }
        loadNames();
    </script>
</body>
</html>
  • Create 'toscrape.json' in the '/html' directory.
  • Run the crawl and wait for it to finish. The output will be json formatted.
  • The output is visible after running 'localhost:1234/crawl?website=https://domaintoscan.com' and waiting for the scrape to finish.
  • Copy the output into this json file (or let the script write the file itself, see the sketch after this list).
  • Open index.html with Live Server.
  • index.html will fetch the json data and present it in a structured way.
  • The results of the scanned website and its webpages will be visible.
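Copying the output by hand works, but crawl.js could also write the registry to the json file itself. A minimal sketch, assuming the /html folder sits next to crawl.js; the fs.writeFileSync call is an addition, not part of the script above:

// Optional addition to crawl.js: write the scraped registry straight to
// /html/toscrape.json so the manual copy step can be skipped.
const fs = require('fs');

// ... inside the /crawl route, just before res.status(200).send(registry):
fs.writeFileSync('./html/toscrape.json', JSON.stringify(registry, null, 2));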

Partial json…

{
  "url": "http://books.toscrape.com/",
  "id": 1
},
{
  "metaElement": "html-seo-title",
  "pageTitle": "\n    All products | Books to Scrape - Sandbox\n"
},
{
  "metaElement": "heading",
  "id": 0,
  "type": "h1",
  "headingTxt": "All products"
},
{
  "metaElement": "image",
  "id": 0,
  "imageSource": "media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg",
  "imageAlt": "A Light in the Attic",
  "imageLazyLoaded": "Not lazy loaded"
},
{
  "metaElement": "link",
  "type": "a",
  "id": 2,
  "linkTxt": "Books",
  "linkUrl": "catalogue/category/books_1/index.html",
  "hrefTarget": null,
  "hrefRel": null
}

Result webscraper for SEO

(some elements removed from the output)

In the script, Books to Scrape was scoped to 'http://books.toscrape.com/catalogue/category/books/':
Scraped: 93 pages in 166.077 seconds…

My portfolio:
Scraped: 37 pages in 41.154 seconds…

Notes / to do’s

  • Better error handling. When scraping an element does not succeed, values from other pages might end up in the results.
  • Some urls end up twice in the queue and in the results, caused by urls ending with or without a trailing slash.
  • Sometimes the crawler stops after the first page; retry and test again. Adding or removing a trailing slash on the start url, or adding or removing www, might give better results.
  • This is a webcrawler: only pages that can be found by navigating the website are included. Urls which are not linked from the website (campaign or landing pages, for example) are not included in the crawl.
  • Add blocking of image loading and of third party javascript to speed up the crawl (a sketch follows below).
  • Use express to create a static folder and use it to host index.html.
  • Based on: https://www.youtube.com/watch?v=68EO2nT5jYo
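For the blocking to-do, a minimal sketch using Playwright's request interception, assuming the context and website variables from crawl.js; add it right after browser.newContext(...). The same-origin check used for "third party" here is an assumption, not something the script above already does:

// Sketch for the to-do above: block images and third party javascript
// to speed up the crawl. Add this right after browser.newContext(...)
// in crawl.js.
const startOrigin = new URL(website).origin;

await context.route('**/*', (route) => {
    const request = route.request();
    const type = request.resourceType();

    // Block all images and fonts.
    if (type === 'image' || type === 'font') return route.abort();

    // Block scripts served from another origin (third party javascript).
    if (type === 'script' && !request.url().startsWith(startOrigin)) return route.abort();

    return route.continue();
});

For the static-folder to-do, adding app.use(express.static('html')) to crawl.js would let Express serve index.html from the /html folder.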
