'use strict';

const puppeteer = require('puppeteer');
const pg = require('pg');

// Instance IDs that the main loop deliberately never scrapes.
// NOTE(review): presumably their detail pages cannot be parsed -- confirm.
const brokenInstances = [8302, 8294];

// FIXME(security): database credentials are hard-coded; move them to
// environment/config and rotate them.
const client = new pg.Client({
  database: 'outcomes_tracker',
  user: 'matt',
  password: 'matt',
});

// FIXME(security): the literal fallbacks below are kept only so existing runs
// keep working. Set OUTCOMES_EMAIL / OUTCOMES_PASSWORD, rotate the password,
// and then delete the literals from source control.
const loginEmail = process.env.OUTCOMES_EMAIL || 'matt.huntington@generalassemb.ly';
const loginPassword = process.env.OUTCOMES_PASSWORD || 'Hunt!ngt0n!4';

/**
 * Resolve after `ms` milliseconds.
 * @param {number} ms
 * @returns {Promise<void>}
 */
const delay = (ms) => new Promise((resolve) => {
  setTimeout(resolve, ms);
});

/** Parse the leading base-10 integer of `text` (NaN when there is none). */
const toInt = (text) => Number.parseInt(text, 10);

/** Return the innerHTML of the first element matching `selector`; rejects when nothing matches. */
const readHTML = (page, selector) => page.$eval(selector, (el) => el.innerHTML);

/**
 * Click `selector` and wait for the navigation it triggers.
 * The wait is registered before the click so a fast navigation cannot be missed.
 */
const clickAndNavigate = async (page, selector) => {
  await Promise.all([page.waitForNavigation(), page.click(selector)]);
};

/**
 * Open one instance detail page, read its figures and insert them as a row
 * of the `instances` table. When the page has no
 * `.instance-student-detail .students` element, a screenshot is written to
 * `outcomes.png` instead and nothing is inserted.
 *
 * @param {string} link - Absolute URL of the instance detail page.
 * @param {import('puppeteer').Page} page - Logged-in page used for navigation.
 * @returns {Promise<void>}
 */
const scrapeInstance = async (link, page) => {
  // Navigate by URL instead of clicking a list-page element handle: handles
  // are disposed as soon as their frame navigates, so they cannot be reused
  // across iterations.
  await page.goto(link);

  const totalStudentsElem = await page.$('.instance-student-detail .students');
  if (!totalStudentsElem) {
    await page.screenshot({ path: 'outcomes.png' });
    return;
  }

  const totalStudents = toInt(await readHTML(page, '.instance-student-detail .students'));

  // The course name is the parenthesised part of the first <h3>.
  const h3Value = await readHTML(page, 'h3');
  const course = h3Value.match(/(?<=\()[ 0-9A-Za-z\-]*(?=\))/g)[0];

  const dropped = toInt(await readHTML(page, '.instance-student-detail .dropped'));
  const graduates = toInt(await readHTML(page, '.graduates'));
  const seeking = toInt(await readHTML(page, '.job-seeking'));

  // Split on a space: first token is the count, second the percentage in parentheses.
  const outcomes90 = await readHTML(page, '.full-time-90-days-actuals');
  const outcomes90Parts = outcomes90.split(' ');
  const outcomes90Numeric = toInt(outcomes90Parts[0]);
  const outcomes90Percent = outcomes90Parts[1].replace(/[()]/g, '');

  const instanceID = toInt(page.url().match(/[0-9]*$/g)[0]);

  // Header is split on ' - '; the first half is further split on ':' to get the start date.
  const instanceHeaderText = await readHTML(page, '.course-header__detail');
  const headerParts = instanceHeaderText.split(' - ');
  const graduationDate = headerParts[1].trim();
  const startDate = headerParts[0].trim().split(':')[1].trim();

  console.log({
    instanceID,
    course,
    startDate,
    graduationDate,
    totalStudents,
    dropped,
    graduates,
    seeking,
    outcomes90Numeric,
    outcomes90Percent,
  });

  // Parameterized so scraped page text can never be interpreted as SQL.
  await client.query(
    `INSERT INTO instances (instance_id, course, start_date, graduation_date, total_students, dropped, graduates, seekers, ninety_day_outcomes)
     VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)`,
    [
      instanceID,
      course,
      startDate,
      graduationDate,
      totalStudents,
      dropped,
      graduates,
      seeking,
      outcomes90Numeric,
    ],
  );
};

/**
 * Log in through the email form and Okta (push verification), filter the
 * instance list from 01/01/2013, and scrape every listed instance that has
 * students, is not already stored in the database and is not listed in
 * `brokenInstances`. The browser and the database connection are released
 * even when a step fails.
 *
 * @returns {Promise<void>}
 */
const main = async () => {
  let browser;
  let isConnected = false;

  try {
    await client.connect();
    isConnected = true;
    const res = await client.query('SELECT $1::text as message', ['Connected to Postgres']);
    console.log(res.rows[0].message);

    browser = await puppeteer.launch({ headless: 'new' });
    const page = await browser.newPage();
    await page.goto('https://outcomes.generalassemb.ly/');

    await page.type('input[type="email"]', loginEmail);
    await clickAndNavigate(page, 'button[type="submit"]');
    console.log('entered email');

    // Pause kept from the original script, which waited 500 ms here before typing.
    await delay(500);
    await page.type('input[type="text"]', loginEmail);
    await page.type('input[type="password"]', loginPassword);
    await page.click('input[type="submit"]');
    console.log('entered okta creds');

    await page.waitForSelector('div[data-se="okta_verify-push"] a');
    await clickAndNavigate(page, 'div[data-se="okta_verify-push"] a');
    console.log('selected push notification');

    await page.waitForSelector('#from');
    console.log('logged in');

    await page.type('#from', '01/01/2013');
    console.log('filtering instances');
    await clickAndNavigate(page, 'input[value="Filter"]');

    // Read everything needed from the list page in one pass, before any
    // navigation invalidates its DOM.
    const rows = await page.$$eval('tr td:nth-child(2) a', (anchors) =>
      anchors.map((anchor) => {
        const row = anchor.parentNode.parentNode;
        const numericCell = row.querySelector('.numeric');
        return {
          href: anchor.href,
          studentsHTML: numericCell ? numericCell.innerHTML : '',
        };
      }));

    for (const { href, studentsHTML } of rows) {
      const numStudents = toInt(studentsHTML);
      if (!(numStudents > 0)) {
        console.log('no students, skipping');
        continue;
      }

      const instanceID = toInt(href.match(/[0-9]*$/)[0]);
      if (Number.isNaN(instanceID)) {
        console.warn('no instance id in link, skipping', href);
        continue;
      }

      const existing = await client.query(
        'SELECT * FROM instances WHERE instance_id = $1',
        [instanceID],
      );
      if (existing.rowCount !== 0) {
        console.log(instanceID, 'found');
        continue;
      }

      if (!brokenInstances.includes(instanceID)) {
        await scrapeInstance(href, page);
      }
    }

    console.log('done');
  } finally {
    try {
      if (browser) {
        await browser.close();
      }
    } finally {
      if (isConnected) {
        await client.end();
      }
    }
  }
};

main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});