import time
import json
import csv
import os

import requests
from bs4 import BeautifulSoup
from jinja2 import Template

import headers

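# Requests to the Sales Navigator endpoints below are authenticated with
# `headers.headers`, a dict defined in a local headers.py that is not shown
# here. A minimal hypothetical sketch, assuming you copy the request headers
# (including your session cookies) out of your own logged-in browser session:
#
#   # headers.py
#   headers = {
#       'user-agent': 'Mozilla/5.0 ...',        # placeholder
#       'cookie': 'li_at=...; JSESSIONID=...',  # placeholder session cookies
#   }
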
# LinkedIn Sales Navigator facet values: job function (FA), seniority (SE)
# and geography (G)
FUNCTION_FACETS = [17, 18, 14, 2, 4, 20, 5, 13, 12, 26]  # FA
SENIORITY_FACETS = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]  # SE
LOCATION_FACETS = [  # G
    'us:8-2-0-1-2',
    'us:97',
    'us:va',
    'us:dc',
    'us:tx',
    'us:ca',
    'us:md',
    'us:70',
    'us:31',
    'us:ny',
    'us:8-8-0-8-1',
    'us:8-8-0-3-1',
    'us:ga',
    'us:52',
    'us:7',
    'us:8-8-0-95-11',
    'us:nj',
    'us:3-2-0-31-1',
]

FACETS = [
    ('FA', FUNCTION_FACETS),
    ('SE', SENIORITY_FACETS),
    ('G', LOCATION_FACETS),
]


def download_file(url, local_filename=None):
    '''Downloads a file with requests
    from: https://stackoverflow.com/a/16696317
    '''

    if local_filename is None:
        local_filename = url.split('/')[-1]

    print('saving to', local_filename)
    r = requests.get(url, stream=True)
    with open(local_filename, 'wb') as f:
        # stream the body in 1KB chunks rather than loading it all into memory
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)

    return local_filename


def get_page(company_id, facet=None, facet_id=None, start=0, count=50):
    '''Gets a single page of results from LinkedIn for a particular facet at a company'''

    # always filter by company (CC); optionally narrow by one extra facet
    params = {
        'facet': ['CC'],
        'facet.CC': company_id,
        'count': count,
        'start': start,
    }

    if facet is not None and facet_id is not None:
        params['facet'] = ['CC', facet]
        params['facet.' + facet] = facet_id

    response = requests.get('https://www.linkedin.com/sales/search/results', headers=headers.headers, params=params)
    return response.json()


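# For example, get_page('533534', 'FA', 17) sends the facet parameter twice,
# producing a query string roughly like (parameter order may differ):
#   .../sales/search/results?facet=CC&facet=FA&facet.CC=533534&facet.FA=17&count=50&start=0
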
def get_company(company_id, outname):
    '''Gets all employees from a company by searching every facet value'''

    people = []

    for facet, facet_ids in FACETS:
        for facet_id in facet_ids:
            print('getting facet', facet, facet_id, 'for company', company_id)
            count = 50
            start = 0
            results = get_page(company_id, facet, facet_id)
            total = results['pagination']['total']
            people += results['searchResults']
            start += count

            # page through the rest of the results for this facet value
            while start < total:
                print('getting', start, 'of', total)
                time.sleep(1)
                results = get_page(company_id, facet, facet_id, start)
                people += results['searchResults']
                start += count

    # facet searches overlap, so the same person can appear more than once;
    # duplicates are removed later, in clean_and_parse()
    with open(outname, 'w') as outfile:
        json.dump(people, outfile, indent=2)

    return outname


def get_images(datafile):
    '''Downloads profile images'''

    with open(datafile, 'r') as infile:
        people = json.load(infile)

    people = [p['member'] for p in people]

    os.makedirs('images', exist_ok=True)

    for p in people:
        # not everyone has a profile photo
        if 'vectorImage' not in p:
            continue

        pid = p['memberId']
        outname = 'images/{}.jpg'.format(pid)

        if os.path.exists(outname):
            print('skipping', outname)
            continue

        # the full image URL is the root plus the widest artifact's path segment
        url = p['vectorImage']['rootUrl']
        url += sorted(p['vectorImage']['artifacts'], key=lambda x: x['width'])[-1]['fileIdentifyingUrlPathSegment']

        print(url)

        download_file(url, outname)

        time.sleep(1)


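# A minimal sketch of the artifact selection above, with made-up values:
#   artifacts = [{'width': 100, 'fileIdentifyingUrlPathSegment': '100_100/0'},
#                {'width': 400, 'fileIdentifyingUrlPathSegment': '400_400/0'}]
#   sorted(artifacts, key=lambda x: x['width'])[-1]['fileIdentifyingUrlPathSegment']
#   # -> '400_400/0', i.e. the widest image available
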
def get_profile(pid):
    '''Downloads an individual profile'''

    os.makedirs('profiles', exist_ok=True)

    outname = 'profiles/{}.json'.format(pid)
    if os.path.exists(outname):
        return outname

    out = {}
    url = 'https://www.linkedin.com/sales/people/{},NAME_SEARCH'.format(pid)
    print(url)
    response = requests.get(url, headers=headers.headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    # profile data is embedded in the page as JSON inside <code> tags;
    # keep the block that contains the contact info
    codes = soup.select('code')
    for c in codes:
        try:
            d = json.loads(c.text)
            if 'contactInfo' in d:
                out = d
                break
        except ValueError:
            # not every <code> tag holds valid JSON
            continue

    with open(outname, 'w') as outfile:
        json.dump(out, outfile)

    time.sleep(1)

    return outname


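# The saved file mirrors whatever JSON LinkedIn embedded in the page; the only
# key this script checks for is 'contactInfo'. For example (assuming a profile
# was saved successfully):
#   with open('profiles/<pid>.json') as f:
#       contact = json.load(f).get('contactInfo')
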
def get_profiles(datafile):
    '''Gets all profiles'''

    with open(datafile, 'r') as infile:
        data = json.load(infile)

    for d in data:
        pid = d['member']['profileId']
        get_profile(pid)


def clean_and_parse(datafile, outname):
    '''Outputs csv, json and html from employee listings'''

    out = []
    mids = []
    with open(datafile, 'r') as infile:
        data = json.load(infile)

    for d in data:
        mid = d['member']['memberId']
        pid = d['member']['profileId']

        imgpath = 'images/{}.jpg'.format(mid)
        if not os.path.exists(imgpath):
            imgpath = None

        item = {
            'name': d['member'].get('formattedName', ''),
            'title': d['member'].get('title', ''),
            'img': imgpath,
            'company': d['company'].get('companyName', ''),
            'location': d['member'].get('location', ''),
            'id': mid,
            'linkedin': 'https://linkedin.com/in/' + pid,
        }

        # the facet searches overlap, so deduplicate by member id
        if mid not in mids:
            out.append(item)
            mids.append(mid)

    with open(outname + '.json', 'w') as jsonfile:
        json.dump(out, jsonfile, indent=2)

    # newline='' keeps the csv module from writing blank rows on Windows
    with open(outname + '.csv', 'w', newline='') as csvfile:
        fieldnames = list(out[0].keys())
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in out:
            writer.writerow(row)

    with open('template.html', 'r') as templatefile:
        template = Template(templatefile.read())
        html = template.render(people=out)

    with open('index.html', 'w') as htmlout:
        htmlout.write(html)


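# clean_and_parse() expects a Jinja2 template named template.html next to this
# script; it is not included here. A minimal hypothetical sketch that renders
# one entry per person, using only keys built above:
#
#   <ul>
#   {% for p in people %}
#     <li>
#       {% if p.img %}<img src="{{ p.img }}">{% endif %}
#       <a href="{{ p.linkedin }}">{{ p.name }}</a>, {{ p.title }}
#     </li>
#   {% endfor %}
#   </ul>
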
if __name__ == '__main__':
    ICE = '533534'  # LinkedIn company id for Immigration and Customs Enforcement
    datafile = 'ice_raw.json'
    get_company(ICE, datafile)
    get_profiles(datafile)
    get_images(datafile)
    clean_and_parse(datafile, 'ice')