// Patch for scrape.js: a single hunk (old lines 97-110 -> new lines 97-121) that
// changes the body of the `for(instance of instanceLinks)` loop.
//
// What the added (+) lines do:
//   * From each link handle, read `parentNode` twice (named td, then tr), look up
//     `.numeric` under that node, and parse its innerHTML with parseInt into
//     numStudents.
//   * Only when numStudents > 0, run the logic that was previously unconditional:
//     read the link's href, take its trailing digits as instanceID, SELECT from
//     `instances` by that id, and call scrapeInstance(instance, page) when the
//     query returns no rows.
//   * New log output: `instanceID, 'found'` when a row already exists, and
//     'no students, skipping' when numStudents is not > 0. A NaN parse result
//     also takes this skip branch, because NaN > 0 is false.
//
// NOTE(review): line breaks of this patch are collapsed onto one line here, so
// the original indentation and blank context lines cannot be seen.
// NOTE(review): `instance` has no const/let in the loop header (unchanged
// context line); unless it is declared outside this hunk it is an implicit
// global -- confirm.
// NOTE(review): presumably `tr.$('.numeric')` yields null when a row has no
// matching cell (Puppeteer-style handle API assumed -- confirm); the next line
// would then throw on `sibling.getProperty`.
// NOTE(review): parseInt is called without a radix.
// NOTE(review): the SQL text is built by interpolating instanceID; the pattern
// /[0-9]*$/ can match an empty string, which would leave `instance_id = ` with
// no value. Consider a parameterized query -- verify against the db client used.
diff --git a/scrape.js b/scrape.js index 154536b..f2e92a4 100644 --- a/scrape.js +++ b/scrape.js @@ -97,14 +97,25 @@ const scrapeInstance = async (link, page) => { const instanceLinks = await page.$$('tr td:nth-child(2) a'); for(instance of instanceLinks){ - - const linkHTML = await instance.getProperty('href'); - const href = await linkHTML.jsonValue() - const instanceID = href.match(/[0-9]*$/)[0]; - const res = await client.query(`SELECT * FROM instances WHERE instance_id = ${instanceID}`); - if(res.rowCount === 0){ - await scrapeInstance(instance, page); + const td = await instance.getProperty('parentNode'); + const tr = await td.getProperty('parentNode'); + const sibling = await tr.$('.numeric'); + const siblingHTML = await sibling.getProperty('innerHTML'); + const numStudents = parseInt(await siblingHTML.jsonValue()); + if(numStudents > 0){ + const linkHTML = await instance.getProperty('href'); + const href = await linkHTML.jsonValue() + const instanceID = href.match(/[0-9]*$/)[0]; + const res = await client.query(`SELECT * FROM instances WHERE instance_id = ${instanceID}`); + if(res.rowCount === 0){ + await scrapeInstance(instance, page); + } else { + console.log(instanceID, 'found'); + } + } else { + console.log('no students, skipping'); } + } console.log('done');