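"""Scrape LinkedIn Sales Navigator for the employees of one company.

Needs a local headers.py next to this script exporting `headers`, a dict
of request headers from an authenticated LinkedIn session (assumption:
copied from a logged-in browser, since the Sales Navigator endpoints
require auth).
"""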
import time
import json
import csv
import os
import requests
from bs4 import BeautifulSoup
import headers
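
# Stream a remote file to disk in 1 KB chunks instead of buffering the
# whole body in memory; the filename defaults to the last URL segment.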
def download_file(url, local_filename=None):
    if local_filename is None:
        local_filename = url.split('/')[-1]
    print('saving to', local_filename)
    r = requests.get(url, stream=True)
    r.raise_for_status()  # don't write an HTML error page to disk
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
    return local_filename
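
# One page of Sales Navigator search results. The CC facet filters by
# current company; count and start are the page size and offset. The
# endpoint returns JSON when sent the authenticated session headers.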
def get_page(company_id, start=0, count=50):
    params = {
        'facet': 'CC',
        'facet.CC': company_id,
        'count': count,
        'start': start,
    }
    response = requests.get('https://www.linkedin.com/sales/search/results',
                            headers=headers.headers, params=params)
    return response.json()
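
# Walk every result page for a company, sleeping 1s between requests,
# and dump the combined list of search results to a JSON file.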
def get_company(company_id, outname):
    count = 50
    start = 0
    results = get_page(company_id)
    total = results['pagination']['total']
    people = results['searchResults']
    start += count
    while start < total:
        print('getting', start, total)
        time.sleep(1)
        results = get_page(company_id, start)
        people += results['searchResults']
        start += count
    with open(outname, 'w') as outfile:
        json.dump(people, outfile, indent=2)
    return outname
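
# Download each member's profile photo by joining vectorImage's rootUrl
# with the path segment of its widest artifact. Existing files are
# skipped, so an interrupted run can be resumed.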
def get_images(datafile):
    with open(datafile, 'r') as infile:
        people = json.load(infile)
    people = [p['member'] for p in people]
    os.makedirs('images', exist_ok=True)
    for p in people:
        if 'vectorImage' not in p:
            continue
        pid = p['memberId']
        outname = 'images/{}.jpg'.format(pid)
        if os.path.exists(outname):
            print('skipping', pid)
            continue
        url = p['vectorImage']['rootUrl']
        # the widest artifact is the highest-resolution photo
        url += sorted(p['vectorImage']['artifacts'],
                      key=lambda x: x['width'])[-1]['fileIdentifyingUrlPathSegment']
        print(url)
        download_file(url, outname)
        time.sleep(1)
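
# Fetch a single profile page and pull the embedded JSON out of its
# <code> tags, keeping the blob that carries contactInfo. Results are
# cached under profiles/ so repeat calls return immediately.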
def get_profile(pid):
    outname = 'profiles/{}.json'.format(pid)
    if os.path.exists(outname):
        return outname
    os.makedirs('profiles', exist_ok=True)
    out = {}
    url = 'https://www.linkedin.com/sales/people/{},NAME_SEARCH'.format(pid)
    print(url)
    response = requests.get(url, headers=headers.headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    for c in soup.select('code'):
        try:
            d = json.loads(c.text)
        except ValueError:  # not every <code> tag holds JSON
            continue
        if 'contactInfo' in d:
            out = d
            break
    with open(outname, 'w') as outfile:
        json.dump(out, outfile)
    time.sleep(1)
    return outname
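
# Fetch (and cache) the full profile for every member in a raw
# search-results file produced by get_company().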
def get_profiles(datafile):
    with open(datafile, 'r') as infile:
        data = json.load(infile)
    for d in data:
        pid = d['member']['profileId']
        get_profile(pid)
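
# Flatten the raw search results into one record per person and write
# it out twice: as JSON (with local image paths and profile URLs) and
# as a four-column CSV.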
def clean_and_parse(datafile, outname):
    out = []
    with open(datafile, 'r') as infile:
        data = json.load(infile)
    for d in data:
        mid = d['member']['memberId']
        pid = d['member']['profileId']
        imgpath = 'images/{}.jpg'.format(mid)
        if not os.path.exists(imgpath):
            imgpath = None
        item = {
            'name': d['member'].get('formattedName', ''),
            'title': d['member'].get('title', ''),
            'img': imgpath,
            'company': d['company'].get('companyName', ''),
            'location': d['member'].get('location', ''),
            'id': mid,
            'linkedin': 'https://linkedin.com/in/' + pid,
        }
        # profile_file = 'profiles/{}.json'.format(pid)
        # if os.path.exists(profile_file):
        #     with open(profile_file, 'r') as profilein:
        #         profile = json.load(profilein)
        out.append(item)
    with open(outname + '.json', 'w') as jsonfile:
        json.dump(out, jsonfile, indent=2)
    # newline='' keeps the csv module from inserting blank rows on Windows
    with open(outname + '.csv', 'w', newline='') as csvfile:
        fieldnames = ['name', 'title', 'location', 'url']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in out:
            writer.writerow({
                'name': row['name'],
                'title': row['title'],
                'location': row['location'],
                'url': row['linkedin'],
            })
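
# Pipeline: fetch the raw search results, then profiles, then photos,
# then flatten. The fetch stages are left commented out so a re-run only
# redoes the cheap local parsing step; uncomment them for a fresh scrape.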
if __name__ == '__main__':
    ICE = '533534'  # LinkedIn company id
    datafile = 'ice_raw.json'
    # get_company(ICE, datafile)
    # get_profiles(datafile)
    # get_images(datafile)
    clean_and_parse(datafile, 'ice')