import time
import json
import csv
import os
import requests
from bs4 import BeautifulSoup
from jinja2 import Template
import headers  # local module exposing a `headers` dict used for every LinkedIn request (presumably copied from an authenticated session)
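

# LinkedIn Sales Navigator "function" (facet.FA) ids to iterate over,
# presumably to keep each query under the search result cap.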
FUNCTION_FACETS = [
    17,
    18,
    14,
    2,
    4,
    20,
    5,
    13,
    12,
    26,
]
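

# Stream a file (here, profile photos) to disk and return the local path.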
def download_file(url, local_filename=None):
    if local_filename is None:
        local_filename = url.split('/')[-1]
    print('saving to', local_filename)
    r = requests.get(url, stream=True)
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
    return local_filename
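

# Fetch one page of Sales Navigator search results as JSON, filtered to a
# single company (facet.CC) and a single job function (facet.FA).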
def get_page(company_id, function_id, start=0, count=50):
    # facet.CC: company id filter, facet.FA: job-function filter (e.g. 17)
    params = {
        'facet': ['CC', 'FA'],
        'facet.CC': company_id,
        'facet.FA': function_id,
        'count': count,
        'start': start,
    }
    response = requests.get('https://www.linkedin.com/sales/search/results',
                            headers=headers.headers, params=params)
    return response.json()
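

# Page through every function facet for a company, 50 results at a time,
# and dump the combined raw search results to `outname` as JSON.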
def get_company(company_id, outname):
    people = []
    for function_id in FUNCTION_FACETS:
        print('getting function', function_id, 'for company', company_id)
        count = 50
        start = 0
        results = get_page(company_id, function_id)
        total = results['pagination']['total']
        people += results['searchResults']
        start += count
        while start < total:
            print('getting', start, 'of', total)
            time.sleep(1)
            results = get_page(company_id, function_id, start)
            people += results['searchResults']
            start += count
    with open(outname, 'w') as outfile:
        json.dump(people, outfile, indent=2)
    return outname
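

# Download the largest available profile photo for each person in the raw
# search results, saving to images/<memberId>.jpg and skipping existing files.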
def get_images(datafile):
    with open(datafile, 'r') as infile:
        people = json.load(infile)
    people = [p['member'] for p in people]
    os.makedirs('images', exist_ok=True)  # output directory must exist before writing
    for p in people:
        if 'vectorImage' not in p:
            continue
        pid = p['memberId']
        outname = 'images/{}.jpg'.format(pid)
        if os.path.exists(outname):
            print('skipping')
            continue
        # the widest artifact is the highest-resolution image
        url = p['vectorImage']['rootUrl']
        url += sorted(p['vectorImage']['artifacts'], key=lambda x: x['width'])[-1]['fileIdentifyingUrlPathSegment']
        print(url)
        download_file(url, outname)
        time.sleep(1)
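

# Fetch a full profile page and pull the embedded JSON (the blob containing
# 'contactInfo') out of its <code> tags; caches to profiles/<profileId>.json.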
def get_profile(pid):
    outname = 'profiles/{}.json'.format(pid)
    if os.path.exists(outname):
        return outname
    out = {}
    url = 'https://www.linkedin.com/sales/people/{},NAME_SEARCH'.format(pid)
    print(url)
    response = requests.get(url, headers=headers.headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    codes = soup.select('code')
    for c in codes:
        try:
            d = json.loads(c.text)
            if 'contactInfo' in d:
                out = d
                break
        except Exception:
            # not every <code> tag holds valid JSON; skip the ones that don't parse
            continue
    os.makedirs('profiles', exist_ok=True)  # output directory must exist before writing
    with open(outname, 'w') as outfile:
        json.dump(out, outfile)
    time.sleep(1)
    return outname
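

# Fetch (and cache) the full profile for every person in the raw search results.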
def get_profiles(datafile):
    with open(datafile, 'r') as infile:
        data = json.load(infile)
    for d in data:
        pid = d['member']['profileId']
        get_profile(pid)
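

# Flatten the raw search results into one record per person (deduplicated by
# memberId) and write <outname>.json, <outname>.csv, and a rendered index.html.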
def clean_and_parse(datafile, outname):
    out = []
    seen = set()  # memberIds already added, so duplicate search hits are dropped
    with open(datafile, 'r') as infile:
        data = json.load(infile)
    for d in data:
        mid = d['member']['memberId']
        pid = d['member']['profileId']
        imgpath = 'images/{}.jpg'.format(mid)
        if not os.path.exists(imgpath):
            imgpath = None
        item = {
            'name': d['member'].get('formattedName', ''),
            'title': d['member'].get('title', ''),
            'img': imgpath,
            'company': d['company'].get('companyName', ''),
            'location': d['member'].get('location', ''),
            'id': mid,
            'linkedin': 'https://linkedin.com/in/' + pid,
        }
        # profile_file = 'profiles/{}.json'.format(pid)
        # if os.path.exists(profile_file):
        #     with open(profile_file, 'r') as profilein:
        #         profile = json.load(profilein)
        if mid not in seen:
            seen.add(mid)
            out.append(item)
    with open(outname + '.json', 'w') as jsonfile:
        json.dump(out, jsonfile, indent=2)
    with open(outname + '.csv', 'w', newline='') as csvfile:
        fieldnames = list(out[0].keys())
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in out:
            writer.writerow(row)
    with open('template.html', 'r') as templatefile:
        template = Template(templatefile.read())
    html = template.render(people=out)
    with open('index.html', 'w') as htmlout:
        htmlout.write(html)
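

# 533534 is the Sales Navigator company id this script targets (bound to ICE);
# run the full pipeline: raw search -> profiles -> photos -> cleaned JSON/CSV/HTML.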
if __name__ == '__main__':
    ICE = '533534'
    datafile = 'ice_raw.json'
    get_company(ICE, datafile)
    get_profiles(datafile)
    get_images(datafile)
    clean_and_parse(datafile, 'ice')