import time
import json
import csv
import os
import requests
from bs4 import BeautifulSoup
from jinja2 import Template
import headers  # local module with request headers; see the sketch below

# these represent different job functions
FUNCTION_FACETS = [17, 18, 14, 2, 4, 20, 5, 13, 12, 26]
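
# Note: `headers` above is a local module, not a PyPI package, and is not
# included in this file. It must expose a dict named `headers` that is passed
# to requests below. A minimal sketch of what it might contain (the exact
# header and cookie values are assumptions; they would be copied from a
# logged-in browser session):
#
#     headers = {
#         'User-Agent': 'Mozilla/5.0 ...',
#         'Cookie': 'li_at=...; JSESSIONID="..."',
#     }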

def download_file(url, local_filename=None):
    '''Downloads a file with requests
    from: https://stackoverflow.com/a/16696317
    '''
    if local_filename is None:
        local_filename = url.split('/')[-1]
    print('saving to', local_filename)
    r = requests.get(url, stream=True)
    with open(local_filename, 'wb') as f:
        # stream the response to disk in 1 KiB chunks
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
    return local_filename

def get_page(company_id, function_id, start=0, count=50):
    '''Gets a single page of results from LinkedIn for a particular job function at a company'''
    params = {
        'facet': ['CC', 'FA'],  # filter by company (CC) and job function (FA)
        'facet.CC': company_id,
        'facet.FA': function_id,
        'count': count,
        'start': start,
    }
    response = requests.get('https://www.linkedin.com/sales/search/results',
                            headers=headers.headers, params=params)
    return response.json()
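
# Example: fetch the first page of results for job function 17 at the
# company ID used in the __main__ block. The JSON is expected to contain a
# 'pagination' dict with a 'total' count plus a 'searchResults' list, which
# is how get_company() below consumes it:
#
#     page = get_page('533534', 17)
#     print(page['pagination']['total'], len(page['searchResults']))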

def get_company(company_id, outname):
    '''Gets all employees from a company using particular job functions'''
    people = []
    for function_id in FUNCTION_FACETS:
        print('getting function', function_id, 'for company', company_id)
        count = 50
        start = 0
        # first page, to learn the total number of results
        results = get_page(company_id, function_id)
        total = results['pagination']['total']
        people += results['searchResults']
        start += count
        # then page through the remainder, 50 results at a time
        while start < total:
            print('getting', start, 'of', total)
            time.sleep(1)
            results = get_page(company_id, function_id, start)
            people += results['searchResults']
            start += count
    with open(outname, 'w') as outfile:
        json.dump(people, outfile, indent=2)
    return outname

def get_images(datafile):
    '''Downloads profile images'''
    os.makedirs('images', exist_ok=True)  # ensure the output directory exists
    with open(datafile, 'r') as infile:
        people = json.load(infile)
    people = [p['member'] for p in people]
    for p in people:
        if 'vectorImage' not in p:
            continue
        pid = p['memberId']
        outname = 'images/{}.jpg'.format(pid)
        if os.path.exists(outname):
            print('skipping')
            continue
        # build the image url from the root plus the widest available artifact
        url = p['vectorImage']['rootUrl']
        url += sorted(p['vectorImage']['artifacts'], key=lambda x: x['width'])[-1]['fileIdentifyingUrlPathSegment']
        print(url)
        download_file(url, outname)
        time.sleep(1)

def get_profile(pid):
    '''Downloads individual profiles'''
    os.makedirs('profiles', exist_ok=True)  # ensure the output directory exists
    outname = 'profiles/{}.json'.format(pid)
    if os.path.exists(outname):
        return outname
    out = {}
    url = 'https://www.linkedin.com/sales/people/{},NAME_SEARCH'.format(pid)
    print(url)
    response = requests.get(url, headers=headers.headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    # profile data is embedded in the page as JSON inside <code> tags;
    # find the blob that carries the contact info
    codes = soup.select('code')
    for c in codes:
        try:
            d = json.loads(c.text)
            if 'contactInfo' in d:
                out = d
                break
        except (json.JSONDecodeError, TypeError):
            continue
    with open(outname, 'w') as outfile:
        json.dump(out, outfile)
    time.sleep(1)
    return outname

def get_profiles(datafile):
    '''Gets all profiles'''
    with open(datafile, 'r') as infile:
        data = json.load(infile)
    for d in data:
        pid = d['member']['profileId']
        get_profile(pid)

def clean_and_parse(datafile, outname):
    '''Outputs csv, json and html from employee listings'''
    out = []
    mids = []  # memberIds already seen, for de-duplication
    with open(datafile, 'r') as infile:
        data = json.load(infile)
    for d in data:
        mid = d['member']['memberId']
        pid = d['member']['profileId']
        imgpath = 'images/{}.jpg'.format(mid)
        if not os.path.exists(imgpath):
            imgpath = None
        item = {
            'name': d['member'].get('formattedName', ''),
            'title': d['member'].get('title', ''),
            'img': imgpath,
            'company': d['company'].get('companyName', ''),
            'location': d['member'].get('location', ''),
            'id': mid,
            'linkedin': 'https://linkedin.com/in/' + pid,
        }
        # the same person can appear under several job functions; keep one copy
        if mid not in mids:
            out.append(item)
            mids.append(mid)
    with open(outname + '.json', 'w') as jsonfile:
        json.dump(out, jsonfile, indent=2)
    with open(outname + '.csv', 'w', newline='') as csvfile:
        fieldnames = list(out[0].keys())
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in out:
            writer.writerow(row)
    with open('template.html', 'r') as templatefile:
        template = Template(templatefile.read())
    html = template.render(people=out)
    with open('index.html', 'w') as htmlout:
        htmlout.write(html)
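
# `template.html` is not included in this file. A minimal Jinja2 template
# that would work with the `people` list rendered above might look like this
# (an assumed sketch, not the original template):
#
#     <ul>
#     {% for p in people %}
#       <li><a href="{{ p.linkedin }}">{{ p.name }}</a>, {{ p.title }}</li>
#     {% endfor %}
#     </ul>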

if __name__ == '__main__':
    ICE = '533534'  # LinkedIn company ID for ICE
    datafile = 'ice_raw.json'
    get_company(ICE, datafile)
    get_profiles(datafile)
    get_images(datafile)
    clean_and_parse(datafile, 'ice')
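
# Usage note: running this script directly requires a headers.py (sketched
# above) and a template.html (sketched above) alongside it. It then writes
# ice_raw.json, per-person files under profiles/ and images/, and the
# combined ice.json, ice.csv, and index.html outputs.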