import time
import json
import csv
import os
import requests
from bs4 import BeautifulSoup
from jinja2 import Template
import headers  # local module defining the request headers (incl. LinkedIn session cookies)

# these represent different job functions
FUNCTION_FACETS = [
    17,
    18,
    14,
    2,
    4,
    20,
    5,
    13,
    12,
    26,
]
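
# (These are the values sent as the 'facet.FA' parameter in get_page
# below; the ID-to-job-function mapping is LinkedIn-internal.)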


def download_file(url, local_filename=None):
    '''Downloads a file with requests
    from: https://stackoverflow.com/a/16696317
    '''
    if local_filename is None:
        local_filename = url.split('/')[-1]
    print('saving to', local_filename)
    r = requests.get(url, stream=True)
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive chunks
                f.write(chunk)
    return local_filename


def get_page(company_id, function_id, start=0, count=50):
    '''Gets a single page of results from linkedin for a particular job function at a company'''
    params = {
        'facet': ['CC', 'FA'],   # filter on company (CC) and job function (FA)
        'facet.CC': company_id,
        'facet.FA': function_id,
        'count': count,
        'start': start,
    }
    response = requests.get('https://www.linkedin.com/sales/search/results',
                            headers=headers.headers, params=params)
    return response.json()
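
# Minimal usage sketch (assumes headers.py carries a logged-in session's
# cookies, which Sales Navigator requires):
#   page = get_page('533534', 17)
#   page['pagination']['total']    # total matches for this company/function
#   page['searchResults']          # up to `count` results for this page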


def get_company(company_id, outname):
    '''Gets all employees from a company using particular job functions'''
    people = []
    for function_id in FUNCTION_FACETS:
        print('getting function', function_id, 'for company', company_id)
        count = 50
        start = 0
        results = get_page(company_id, function_id)
        total = results['pagination']['total']
        people += results['searchResults']
        start += count
        while start < total:
            print('getting', start, 'of', total)
            time.sleep(1)
            results = get_page(company_id, function_id, start)
            people += results['searchResults']
            start += count
    with open(outname, 'w') as outfile:
        json.dump(people, outfile, indent=2)
    return outname


def get_images(datafile):
    '''Downloads profile images'''
    os.makedirs('images', exist_ok=True)  # make sure the output directory exists
    with open(datafile, 'r') as infile:
        people = json.load(infile)
    people = [p['member'] for p in people]
    for p in people:
        if 'vectorImage' not in p:
            continue
        pid = p['memberId']
        outname = 'images/{}.jpg'.format(pid)
        if os.path.exists(outname):
            print('skipping', outname)
            continue
        # the widest artifact is the highest-resolution image
        url = p['vectorImage']['rootUrl']
        url += sorted(p['vectorImage']['artifacts'], key=lambda x: x['width'])[-1]['fileIdentifyingUrlPathSegment']
        print(url)
        download_file(url, outname)
        time.sleep(1)


def get_profile(pid):
    '''Downloads an individual profile'''
    os.makedirs('profiles', exist_ok=True)  # make sure the output directory exists
    outname = 'profiles/{}.json'.format(pid)
    if os.path.exists(outname):
        return outname
    out = {}
    url = 'https://www.linkedin.com/sales/people/{},NAME_SEARCH'.format(pid)
    print(url)
    response = requests.get(url, headers=headers.headers)
    # profile data is embedded in the page as JSON inside <code> tags;
    # keep the blob that contains the contact info
    soup = BeautifulSoup(response.text, 'html.parser')
    codes = soup.select('code')
    for c in codes:
        try:
            d = json.loads(c.text)
            if 'contactInfo' in d:
                out = d
                break
        except (json.JSONDecodeError, TypeError):  # not JSON, or not a dict/list
            continue
    with open(outname, 'w') as outfile:
        json.dump(out, outfile)
    time.sleep(1)
    return outname
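
# Example (hypothetical profile id):
#   get_profile('jane-doe-1a2b3c')   # -> writes and returns 'profiles/jane-doe-1a2b3c.json'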


def get_profiles(datafile):
    '''Gets all profiles'''
    with open(datafile, 'r') as infile:
        data = json.load(infile)
    for d in data:
        pid = d['member']['profileId']
        get_profile(pid)


def clean_and_parse(datafile, outname):
    '''Outputs csv, json and html from employee listings'''
    out = []
    mids = []
    with open(datafile, 'r') as infile:
        data = json.load(infile)
    for d in data:
        mid = d['member']['memberId']
        pid = d['member']['profileId']
        imgpath = 'images/{}.jpg'.format(mid)
        if not os.path.exists(imgpath):
            imgpath = None
        item = {
            'name': d['member'].get('formattedName', ''),
            'title': d['member'].get('title', ''),
            'img': imgpath,
            'company': d['company'].get('companyName', ''),
            'location': d['member'].get('location', ''),
            'id': mid,
            'linkedin': 'https://linkedin.com/in/' + pid,
        }
        if mid not in mids:  # de-duplicate people who matched more than one facet
            out.append(item)
            mids.append(mid)
    with open(outname + '.json', 'w') as jsonfile:
        json.dump(out, jsonfile, indent=2)
    with open(outname + '.csv', 'w', newline='') as csvfile:
        fieldnames = list(out[0].keys())
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in out:
            writer.writerow(row)
    with open('template.html', 'r') as templatefile:
        template = Template(templatefile.read())
    html = template.render(people=out)
    with open('index.html', 'w') as htmlout:
        htmlout.write(html)
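
# clean_and_parse expects a Jinja template at template.html (not shown in
# this file); a minimal sketch of what it might render:
#   <ul>
#   {% for p in people %}
#     <li><img src="{{ p.img }}"> {{ p.name }}, {{ p.title }} ({{ p.location }})</li>
#   {% endfor %}
#   </ul>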


if __name__ == '__main__':
    ICE = '533534'  # LinkedIn company id
    datafile = 'ice_raw.json'
    get_company(ICE, datafile)
    get_profiles(datafile)
    get_images(datafile)
    clean_and_parse(datafile, 'ice')