
clean up scraper

master
Sam, 7 years ago
commit 58f4cb12c8
1 changed file with 16 additions and 6 deletions
linked_in_scraper.py

@@ -7,6 +7,7 @@ from bs4 import BeautifulSoup
from jinja2 import Template
import headers
# these represent different job functions
FUNCTION_FACETS = [
17,
18,
@@ -21,6 +22,10 @@ FUNCTION_FACETS = [
]
def download_file(url, local_filename=None):
'''Downloads a file with requests
from: https://stackoverflow.com/a/16696317
'''
if local_filename is None:
local_filename = url.split('/')[-1]
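
For context (not part of this commit): the hunk only shows the top of download_file, so here is a minimal sketch of the streaming download pattern from the Stack Overflow answer cited in the new docstring; the actual body in the repository may differ.

import requests

def download_file(url, local_filename=None):
    '''Downloads a file with requests
    from: https://stackoverflow.com/a/16696317
    '''
    if local_filename is None:
        local_filename = url.split('/')[-1]
    # stream the body so large files never sit fully in memory
    r = requests.get(url, stream=True)
    r.raise_for_status()
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # skip keep-alive chunks
                f.write(chunk)
    return local_filename
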
@@ -35,7 +40,8 @@ def download_file(url, local_filename=None):
def get_page(company_id, function_id, start=0, count=50):
# facet.FA 17
'''Gets a single page of results from linkedin for a particular job function at a company'''
params = {
'facet': ['CC', 'FA'],
'facet.CC': company_id,
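
For context (not part of this commit): the params dict is cut off at the hunk boundary. A plausible completion is sketched below, with the remaining keys inferred from the function signature and the facet.FA comment; any key name beyond 'facet' and 'facet.CC' is an assumption.

# hypothetical completion of the params dict shown above; only 'facet',
# 'facet.CC' and the signature defaults are visible in the diff
params = {
    'facet': ['CC', 'FA'],      # filter on current company and job function
    'facet.CC': company_id,     # company facet value
    'facet.FA': function_id,    # job-function facet value (one of FUNCTION_FACETS)
    'start': start,             # pagination offset
    'count': count,             # results per page (defaults to 50)
}
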
@@ -49,6 +55,7 @@ def get_page(company_id, function_id, start=0, count=50):
def get_company(company_id, outname):
'''Gets all employees from a company using particular job functions'''
people = []
for function_id in FUNCTION_FACETS:
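
For context (not part of this commit): the rest of the loop falls outside the hunk. A sketch of the paging shape it implies, assuming get_page returns one parsed page of results and an empty page signals the end, neither of which is confirmed by the diff.

# hypothetical paging loop; the real body of get_company is not shown here
def get_company_sketch(company_id):
    people = []
    for function_id in FUNCTION_FACETS:
        start = 0
        while True:
            page = get_page(company_id, function_id, start=start, count=50)
            if not page:
                break
            people.extend(page)
            start += 50
    return people
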
@@ -73,6 +80,8 @@ def get_company(company_id, outname):
def get_images(datafile):
'''Downloads profile images'''
with open(datafile, 'r') as infile:
people = json.load(infile)
@@ -100,6 +109,8 @@ def get_images(datafile):
def get_profile(pid):
'''Downloads individual profiles'''
outname = 'profiles/{}.json'.format(pid)
if os.path.exists(outname):
return outname
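
For context (not part of this commit): get_profile skips the network call when the profile JSON is already on disk. The visible lines follow the cache-on-disk pattern sketched below, with the fetch itself left as a placeholder since it is outside the hunk.

import json
import os

def fetch_with_cache(pid, fetch):
    '''Cache-on-disk pattern used by get_profile; `fetch` stands in for
    whatever request the real function makes (not shown in the diff).'''
    outname = 'profiles/{}.json'.format(pid)
    if os.path.exists(outname):
        return outname  # already downloaded, skip the request
    data = fetch(pid)
    with open(outname, 'w') as outfile:
        json.dump(data, outfile)
    return outname
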
@@ -127,6 +138,8 @@ def get_profile(pid):
def get_profiles(datafile):
'''Gets all profiles'''
with open(datafile, 'r') as infile:
data = json.load(infile)
@@ -136,6 +149,8 @@ def get_profiles(datafile):
def clean_and_parse(datafile, outname):
'''Outputs csv, json and html from employee listings'''
out = []
with open(datafile, 'r') as infile:
data = json.load(infile)
@@ -158,11 +173,6 @@ def clean_and_parse(datafile, outname):
'linkedin': 'https://linkedin.com/in/' + pid,
}
# profile_file = 'profiles/{}.json'.format(pid)
# if os.path.exists(profile_file):
# with open(profile_file, 'r') as profilein:
# profile = json.load(profilein)
if mid not in out:
out.append(item)
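
For context (not part of this commit): the new docstring says clean_and_parse outputs csv, json and html. A sketch of what that output step might look like, using the jinja2 Template import from the top of the file; the template filename and field handling are assumptions, not taken from the diff.

# hypothetical output step for clean_and_parse; only the docstring and the
# jinja2 import ground this, so treat names like template.html as placeholders
import csv
import json
from jinja2 import Template

def write_outputs(out, outname):
    with open(outname + '.json', 'w') as f:
        json.dump(out, f, indent=2)
    with open(outname + '.csv', 'w') as f:
        writer = csv.DictWriter(f, fieldnames=sorted(out[0].keys()))
        writer.writeheader()
        writer.writerows(out)
    with open('template.html', 'r') as f:
        template = Template(f.read())
    with open(outname + '.html', 'w') as f:
        f.write(template.render(people=out))
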

