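'''Scrapes LinkedIn Sales Navigator for a company's employee listings (here,
ICE), then downloads profile pages and photos and writes the cleaned results
out as JSON, CSV and a rendered HTML page. Requires an authenticated
session; see the note on the headers module below.'''
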
import time
import json
import csv
import os

import requests
from bs4 import BeautifulSoup
from jinja2 import Template

import headers
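
# `headers` is a local module (headers.py) that is assumed to define a
# `headers` dict holding the request headers of a logged-in LinkedIn
# Sales Navigator session (cookies, user agent and so on). It is not
# included in this file and must be supplied by the user.
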
# these represent different job functions
FUNCTION_FACETS = [17, 18, 14, 2, 4, 20, 5, 13, 12, 26]  # FA

# seniority levels
SENIORITY_FACETS = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]  # SE

# US locations: state codes plus what appear to be numeric region codes
LOCATION_FACETS = [  # G
    'us:8-2-0-1-2',
    'us:97',
    'us:va',
    'us:dc',
    'us:tx',
    'us:ca',
    'us:md',
    'us:70',
    'us:31',
    'us:ny',
    'us:8-8-0-8-1',
    'us:8-8-0-3-1',
    'us:ga',
    'us:52',
    'us:7',
    'us:8-8-0-95-11',
    'us:nj',
    'us:3-2-0-31-1',
]

# (facet name, facet ids) pairs that get_company() iterates over
FACETS = [
    ('FA', FUNCTION_FACETS),
    ('SE', SENIORITY_FACETS),
    ('G', LOCATION_FACETS)
]
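
# get_company() below sweeps these facet families one value at a time rather
# than requesting the whole company in a single search, presumably because
# LinkedIn caps how deep into one result set you can paginate.
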
def download_file(url, local_filename=None):
    '''Downloads a file with requests
    from: https://stackoverflow.com/a/16696317
    '''
    if local_filename is None:
        local_filename = url.split('/')[-1]
    print('saving to', local_filename)
    r = requests.get(url, stream=True)
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive chunks
                f.write(chunk)
    return local_filename
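
# NOTE: download_file does not check the response status, so a failed
# request will write the error body to disk as if it were the image.
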
def get_page(company_id, facet=None, facet_id=None, start=0, count=50):
    '''Gets a single page of results from linkedin for a particular job function at a company'''
    params = {
        'facet': ['CC'],  # CC: filter by (current) company
        'facet.CC': company_id,
        'count': count,
        'start': start,
    }
    if facet is not None and facet_id is not None:
        # narrow the search further by one extra facet (FA, SE or G)
        params['facet'] = ['CC', facet]
        params['facet.' + facet] = facet_id
    response = requests.get('https://www.linkedin.com/sales/search/results',
                            headers=headers.headers, params=params)
    return response.json()
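
# requests encodes the list under 'facet' as a repeated query parameter, so
# a call like get_page('533534', 'FA', 17) yields a URL of roughly this shape:
#   https://www.linkedin.com/sales/search/results?facet=CC&facet=FA&facet.CC=533534&facet.FA=17&count=50&start=0
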
def get_company(company_id, outname):
    '''Gets all employees from a company using particular job functions'''
    people = []
    for facet, facet_ids in FACETS:
        for facet_id in facet_ids:
            print('getting facet', facet, facet_id, 'for company', company_id)
            count = 50
            start = 0
            # the first page also reports how many results there are in total
            results = get_page(company_id, facet, facet_id)
            total = results['pagination']['total']
            people += results['searchResults']
            start += count
            # then page through the rest, 50 results at a time
            while start < total:
                print('getting', start, 'of', total)
                time.sleep(1)
                results = get_page(company_id, facet, facet_id, start)
                people += results['searchResults']
                start += count
    with open(outname, 'w') as outfile:
        json.dump(people, outfile, indent=2)
    return outname
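
# The same person can match more than one facet, so the raw dump written
# here contains duplicates; clean_and_parse() deduplicates by memberId.
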
def get_images(datafile):
    '''Downloads profile images'''
    with open(datafile, 'r') as infile:
        people = json.load(infile)
    people = [p['member'] for p in people]
    for p in people:
        if 'vectorImage' not in p:
            continue
        # note: images are keyed by the numeric memberId, not the profileId
        pid = p['memberId']
        outname = 'images/{}.jpg'.format(pid)
        if os.path.exists(outname):
            print('skipping')
            continue
        # the image comes in several sizes ("artifacts"); take the widest one
        url = p['vectorImage']['rootUrl']
        url += sorted(p['vectorImage']['artifacts'],
                      key=lambda x: x['width'])[-1]['fileIdentifyingUrlPathSegment']
        print(url)
        download_file(url, outname)
        time.sleep(1)
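
# NOTE: the images/ output directory is assumed to exist already; neither
# get_images nor download_file creates it.
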
def get_profile(pid):
    '''Downloads individual profiles'''
    outname = 'profiles/{}.json'.format(pid)
    if os.path.exists(outname):
        return outname
    out = {}
    url = 'https://www.linkedin.com/sales/people/{},NAME_SEARCH'.format(pid)
    print(url)
    response = requests.get(url, headers=headers.headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    # the profile data is embedded in the page as JSON inside <code> tags;
    # find the blob that contains the contact info
    codes = soup.select('code')
    for c in codes:
        try:
            d = json.loads(c.text)
            if 'contactInfo' in d:
                out = d
                break
        except Exception:
            # not every <code> tag holds usable JSON
            continue
    with open(outname, 'w') as outfile:
        json.dump(out, outfile)
    time.sleep(1)
    return outname
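
# NOTE: as with images/, the profiles/ directory must exist before running.
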
def get_profiles(datafile):
    '''Gets all profiles'''
    with open(datafile, 'r') as infile:
        data = json.load(infile)
    for d in data:
        pid = d['member']['profileId']
        get_profile(pid)

def clean_and_parse(datafile, outname):
    '''Outputs csv, json and html from employee listings'''
    out = []
    mids = []
    with open(datafile, 'r') as infile:
        data = json.load(infile)
    for d in data:
        mid = d['member']['memberId']
        pid = d['member']['profileId']
        imgpath = 'images/{}.jpg'.format(mid)
        if not os.path.exists(imgpath):
            imgpath = None
        item = {
            'name': d['member'].get('formattedName', ''),
            'title': d['member'].get('title', ''),
            'img': imgpath,
            'company': d['company'].get('companyName', ''),
            'location': d['member'].get('location', ''),
            'id': mid,
            'linkedin': 'https://linkedin.com/in/' + pid,
        }
        # the raw dump has one entry per (person, facet) match, so
        # deduplicate by memberId
        if mid not in mids:
            out.append(item)
            mids.append(mid)
    with open(outname + '.json', 'w') as jsonfile:
        json.dump(out, jsonfile, indent=2)
    # newline='' keeps the csv module from writing blank rows on Windows
    with open(outname + '.csv', 'w', newline='') as csvfile:
        fieldnames = list(out[0].keys())
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in out:
            writer.writerow(row)
    with open('template.html', 'r') as templatefile:
        template = Template(templatefile.read())
    html = template.render(people=out)
    with open('index.html', 'w') as htmlout:
        htmlout.write(html)
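
# NOTE: rendering expects a Jinja2 template named template.html next to this
# script; it receives the cleaned records as the `people` variable and is
# not included here.
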
if __name__ == '__main__':
    ICE = '533534'  # LinkedIn company id for ICE
    datafile = 'ice_raw.json'
    get_company(ICE, datafile)
    get_profiles(datafile)
    get_images(datafile)
    clean_and_parse(datafile, 'ice')