diff --git a/linked_in_scraper.py b/linked_in_scraper.py index 4e63b59..95f9ab5 100644 --- a/linked_in_scraper.py +++ b/linked_in_scraper.py @@ -7,6 +7,7 @@ from bs4 import BeautifulSoup from jinja2 import Template import headers +# these represent different job functions FUNCTION_FACETS = [ 17, 18, @@ -21,6 +22,10 @@ FUNCTION_FACETS = [ ] def download_file(url, local_filename=None): + '''Downloads a file with requests + from: https://stackoverflow.com/a/16696317 + ''' + if local_filename is None: local_filename = url.split('/')[-1] @@ -35,7 +40,8 @@ def download_file(url, local_filename=None): def get_page(company_id, function_id, start=0, count=50): - # facet.FA 17 + '''Gets a single page of results from linkedin for a particular job function at a company''' + params = { 'facet': ['CC', 'FA'], 'facet.CC': company_id, @@ -49,6 +55,7 @@ def get_page(company_id, function_id, start=0, count=50): def get_company(company_id, outname): + '''Gets all employees from a company using particular job functions''' people = [] for function_id in FUNCTION_FACETS: @@ -73,6 +80,8 @@ def get_company(company_id, outname): def get_images(datafile): + '''Downloads profile images''' + with open(datafile, 'r') as infile: people = json.load(infile) @@ -100,6 +109,8 @@ def get_images(datafile): def get_profile(pid): + '''Downloads individual profiles''' + outname = 'profiles/{}.json'.format(pid) if os.path.exists(outname): return outname @@ -127,6 +138,8 @@ def get_profile(pid): def get_profiles(datafile): + '''Gets all profiles''' + with open(datafile, 'r') as infile: data = json.load(infile) @@ -136,6 +149,8 @@ def get_profiles(datafile): def clean_and_parse(datafile, outname): + '''Outputs csv, json and html from employee listings''' + out = [] with open(datafile, 'r') as infile: data = json.load(infile) @@ -158,11 +173,6 @@ def clean_and_parse(datafile, outname): 'linkedin': 'https://linkedin.com/in/' + pid, } - # profile_file = 'profiles/{}.json'.format(pid) - # if os.path.exists(profile_file): - # with open(profile_file, 'r') as profilein: - # profile = json.load(profilein) - if mid not in out: out.append(item)