import time
import json
import csv
import os
import requests
from bs4 import BeautifulSoup
from jinja2 import Template
import headers

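# `headers` is a local module (headers.py, not shown here) expected to expose
# a `headers` dict carrying the request headers of an authenticated LinkedIn
# Sales Navigator browser session. A minimal sketch, with placeholder values:
#
#   headers = {
#       'user-agent': 'Mozilla/5.0 ...',
#       'cookie': 'li_at=...; JSESSIONID="ajax:..."',
#       'csrf-token': 'ajax:...',
#   }
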
# Sales Navigator "FA" (job function) facet values to sweep for each company.
FUNCTION_FACETS = [
    17,
    18,
    14,
    2,
    4,
    20,
    5,
    13,
    12,
    26,
]

def download_file(url, local_filename=None):
    """Stream a file to disk, defaulting the name to the last URL segment."""
    if local_filename is None:
        local_filename = url.split('/')[-1]
    print('saving to', local_filename)
    r = requests.get(url, stream=True)
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
    return local_filename

def get_page(company_id, function_id, start=0, count=50):
    """Fetch one page of Sales Navigator search results for a company/function pair."""
    params = {
        'facet': ['CC', 'FA'],  # filter on current company (CC) and job function (FA)
        'facet.CC': company_id,
        'facet.FA': function_id,
        'count': count,
        'start': start,
    }
    response = requests.get('https://www.linkedin.com/sales/search/results',
                            headers=headers.headers, params=params)
    return response.json()

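# Judging from the accesses below, each search response is assumed to look
# roughly like:
#
#   {
#     "pagination": {"total": <int>},
#     "searchResults": [
#       {
#         "member": {"memberId": ..., "profileId": ..., "formattedName": ...,
#                    "title": ..., "location": ..., "vectorImage": {...}},
#         "company": {"companyName": ...}
#       },
#       ...
#     ]
#   }
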
def get_company(company_id, outname):
    """Page through every function facet for a company and dump the raw results."""
    people = []
    for function_id in FUNCTION_FACETS:
        print('getting function', function_id, 'for company', company_id)
        count = 50
        start = 0
        results = get_page(company_id, function_id)
        total = results['pagination']['total']
        people += results['searchResults']
        start += count
        while start < total:
            print('getting', start, 'of', total)
            time.sleep(1)  # pause between paged requests
            results = get_page(company_id, function_id, start)
            people += results['searchResults']
            start += count
    with open(outname, 'w') as outfile:
        json.dump(people, outfile, indent=2)
    return outname

def get_images(datafile):
    """Download each member's profile photo, keyed by memberId, into images/."""
    with open(datafile, 'r') as infile:
        people = json.load(infile)
    people = [p['member'] for p in people]
    for p in people:
        if 'vectorImage' not in p:
            continue
        mid = p['memberId']
        outname = 'images/{}.jpg'.format(mid)
        if os.path.exists(outname):
            print('skipping')
            continue
        # Pick the widest artifact offered for this image.
        url = p['vectorImage']['rootUrl']
        url += sorted(p['vectorImage']['artifacts'],
                      key=lambda x: x['width'])[-1]['fileIdentifyingUrlPathSegment']
        print(url)
        download_file(url, outname)
        time.sleep(1)

def get_profile(pid):
    """Fetch one profile page and cache the embedded JSON that holds contactInfo."""
    outname = 'profiles/{}.json'.format(pid)
    if os.path.exists(outname):
        return outname
    out = {}
    url = 'https://www.linkedin.com/sales/people/{},NAME_SEARCH'.format(pid)
    print(url)
    response = requests.get(url, headers=headers.headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    # The page embeds its data as JSON inside <code> tags; keep the block
    # that carries contactInfo.
    for c in soup.select('code'):
        try:
            d = json.loads(c.text)
        except ValueError:
            continue
        if 'contactInfo' in d:
            out = d
            break
    with open(outname, 'w') as outfile:
        json.dump(out, outfile)
    time.sleep(1)
    return outname

def get_profiles(datafile):
    """Fetch and cache the detailed profile for every member in the raw dump."""
    with open(datafile, 'r') as infile:
        data = json.load(infile)
    for d in data:
        pid = d['member']['profileId']
        get_profile(pid)

def clean_and_parse(datafile, outname):
    """Flatten the raw dump into per-person records, then write JSON, CSV and HTML."""
    out = []
    seen = set()  # memberIds already emitted; skips duplicates across facets
    with open(datafile, 'r') as infile:
        data = json.load(infile)
    for d in data:
        mid = d['member']['memberId']
        pid = d['member']['profileId']
        imgpath = 'images/{}.jpg'.format(mid)
        if not os.path.exists(imgpath):
            imgpath = None
        item = {
            'name': d['member'].get('formattedName', ''),
            'title': d['member'].get('title', ''),
            'img': imgpath,
            'company': d['company'].get('companyName', ''),
            'location': d['member'].get('location', ''),
            'id': mid,
            'linkedin': 'https://linkedin.com/in/' + pid,
        }
        # profile_file = 'profiles/{}.json'.format(pid)
        # if os.path.exists(profile_file):
        #     with open(profile_file, 'r') as profilein:
        #         profile = json.load(profilein)
        if mid not in seen:
            seen.add(mid)
            out.append(item)
    with open(outname + '.json', 'w') as jsonfile:
        json.dump(out, jsonfile, indent=2)
    with open(outname + '.csv', 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=list(out[0].keys()))
        writer.writeheader()
        writer.writerows(out)
    with open('template.html', 'r') as templatefile:
        template = Template(templatefile.read())
    html = template.render(people=out)
    with open('index.html', 'w') as htmlout:
        htmlout.write(html)

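# template.html is a Jinja2 template rendered with the cleaned `people` list;
# it is not included here. A hypothetical minimal version could be:
#
#   <ul>
#   {% for p in people %}
#     <li>
#       {% if p.img %}<img src="{{ p.img }}">{% endif %}
#       <a href="{{ p.linkedin }}">{{ p.name }}</a>, {{ p.title }} ({{ p.location }})
#     </li>
#   {% endfor %}
#   </ul>
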
if __name__ == '__main__':
    ICE = '533534'  # target LinkedIn company id
    datafile = 'ice_raw.json'
    # The image and profile caches are written into these directories.
    os.makedirs('images', exist_ok=True)
    os.makedirs('profiles', exist_ok=True)
    get_company(ICE, datafile)
    get_profiles(datafile)
    get_images(datafile)
    clean_and_parse(datafile, 'ice')
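
# Usage sketch (the file name scrape.py is an assumption, not from the repo):
# place this file next to headers.py and template.html, then run
#
#   python scrape.py
#
# to produce ice_raw.json, the images/ and profiles/ caches, ice.json,
# ice.csv and index.html.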