import time
import json
import csv
import os

import requests
from bs4 import BeautifulSoup

# Local module expected to define a `headers` dict with the authenticated
# request headers for LinkedIn Sales Navigator (see the sketch at the
# bottom of this file).
import headers


def download_file(url, local_filename=None):
    """Stream a file from `url` to disk, defaulting to its URL basename."""
    if local_filename is None:
        local_filename = url.split('/')[-1]
    print('saving to', local_filename)
    r = requests.get(url, stream=True)
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # skip keep-alive chunks
                f.write(chunk)
    return local_filename


def get_page(company_id, start=0, count=50):
    """Fetch one page of Sales Navigator search results, filtered to
    current employees of `company_id` via the CC (current company) facet."""
    params = {
        'facet': 'CC',
        'facet.CC': company_id,
        'count': count,
        'start': start,
    }
    response = requests.get('https://www.linkedin.com/sales/search/results',
                            headers=headers.headers,
                            params=params)
    return response.json()


def get_company(company_id, outname):
    """Page through all search results for `company_id` and dump the
    combined list to `outname` as JSON."""
    count = 50
    start = 0
    results = get_page(company_id)
    total = results['pagination']['total']
    people = results['searchResults']
    start += count
    while start < total:
        print('getting', start, total)
        time.sleep(1)  # throttle requests
        results = get_page(company_id, start)
        people += results['searchResults']
        start += count
    with open(outname, 'w') as outfile:
        json.dump(people, outfile, indent=2)
    return outname


def get_images(datafile):
    """Download each person's profile photo to images/<memberId>.jpg,
    picking the largest available rendition."""
    os.makedirs('images', exist_ok=True)  # avoid crashing on a fresh checkout
    with open(datafile, 'r') as infile:
        people = json.load(infile)
    people = [p['member'] for p in people]
    for p in people:
        if 'vectorImage' not in p:
            continue
        pid = p['memberId']
        outname = 'images/{}.jpg'.format(pid)
        if os.path.exists(outname):
            print('skipping')
            continue
        # Image URL = root URL + path segment of the widest artifact.
        url = p['vectorImage']['rootUrl']
        url += sorted(p['vectorImage']['artifacts'],
                      key=lambda x: x['width'])[-1]['fileIdentifyingUrlPathSegment']
        print(url)
        download_file(url, outname)
        time.sleep(1)  # throttle requests


def get_profile(pid):
    """Fetch one Sales Navigator profile page, extract the embedded JSON
    payload (the <code> block containing 'contactInfo'), and cache it to
    profiles/<profileId>.json."""
    os.makedirs('profiles', exist_ok=True)  # avoid crashing on a fresh checkout
    outname = 'profiles/{}.json'.format(pid)
    if os.path.exists(outname):
        return outname
    out = {}
    url = 'https://www.linkedin.com/sales/people/{},NAME_SEARCH'.format(pid)
    print(url)
    response = requests.get(url, headers=headers.headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    codes = soup.select('code')
    for c in codes:
        try:
            d = json.loads(c.text)
            if 'contactInfo' in d:
                out = d
                break
        except Exception:
            # Not every <code> element holds JSON; skip the ones that don't.
            continue
    with open(outname, 'w') as outfile:
        json.dump(out, outfile)
    time.sleep(1)  # throttle requests
    return outname


def get_profiles(datafile):
    """Fetch the full profile for every person listed in `datafile`."""
    with open(datafile, 'r') as infile:
        data = json.load(infile)
    for d in data:
        pid = d['member']['profileId']
        get_profile(pid)


def clean_and_parse(datafile, outname):
    """Flatten the raw search results in `datafile` into a list of people
    and write it out as both <outname>.json and <outname>.csv."""
    out = []
    with open(datafile, 'r') as infile:
        data = json.load(infile)
    for d in data:
        mid = d['member']['memberId']
        pid = d['member']['profileId']
        imgpath = 'images/{}.jpg'.format(mid)
        if not os.path.exists(imgpath):
            imgpath = None
        item = {
            'name': d['member'].get('formattedName', ''),
            'title': d['member'].get('title', ''),
            'img': imgpath,
            'company': d['company'].get('companyName', ''),
            'location': d['member'].get('location', ''),
            'id': mid,
            'linkedin': 'https://linkedin.com/in/' + pid,
        }
        # profile_file = 'profiles/{}.json'.format(pid)
        # if os.path.exists(profile_file):
        #     with open(profile_file, 'r') as profilein:
        #         profile = json.load(profilein)
        out.append(item)
    with open(outname + '.json', 'w') as jsonfile:
        json.dump(out, jsonfile, indent=2)
    with open(outname + '.csv', 'w') as csvfile:
        fieldnames = ['name', 'title', 'location', 'url']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in out:
            writer.writerow({
                'name': row['name'],
                'title': row['title'],
                'location': row['location'],
                'url': row['linkedin'],
            })


if __name__ == '__main__':
    ICE = '533534'  # LinkedIn company id
    datafile = 'ice_raw.json'
    # Uncomment the steps below to re-run the scrape; clean_and_parse only
    # post-processes data already on disk.
    # get_company(ICE, datafile)
    # get_profiles(datafile)
    # get_images(datafile)
    clean_and_parse(datafile, 'ice')
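
# ---------------------------------------------------------------------------
# Sketch of the local `headers` module imported above (it is not part of this
# file). The assumption: headers.py defines a module-level `headers` dict
# holding the request headers of a logged-in Sales Navigator session,
# typically copied out of the browser's network inspector. Every value below
# is a hypothetical placeholder, not a real credential.
#
#   # headers.py (hypothetical)
#   headers = {
#       'user-agent': 'Mozilla/5.0 ...',                             # placeholder
#       'accept': 'application/json',
#       'cookie': 'li_at=PLACEHOLDER; JSESSIONID="ajax:PLACEHOLDER"',
#       'csrf-token': 'ajax:PLACEHOLDER',  # commonly mirrors JSESSIONID
#   }
# ---------------------------------------------------------------------------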