You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

173 lines
4.4 KiB

7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
  1. import time
  2. import json
  3. import csv
  4. import os
  5. import requests
  6. from bs4 import BeautifulSoup
  7. from jinja2 import Template
  8. import headers
  9. def download_file(url, local_filename=None):
  10. if local_filename is None:
  11. local_filename = url.split('/')[-1]
  12. print('saving to', local_filename)
  13. r = requests.get(url, stream=True)
  14. with open(local_filename, 'wb') as f:
  15. for chunk in r.iter_content(chunk_size=1024):
  16. if chunk:
  17. f.write(chunk)
  18. return local_filename
  19. def get_page(company_id, start=0, count=50):
  20. params = {
  21. 'facet': 'CC',
  22. 'facet.CC': company_id,
  23. 'count': count,
  24. 'start': start,
  25. }
  26. response = requests.get('https://www.linkedin.com/sales/search/results', headers=headers.headers, params=params)
  27. return response.json()
  28. def get_company(company_id, outname):
  29. count = 50
  30. start = 0
  31. results = get_page(company_id)
  32. total = results['pagination']['total']
  33. people = results['searchResults']
  34. start += count
  35. while start < total:
  36. print('getting', start, total)
  37. time.sleep(1)
  38. results = get_page(company_id, start)
  39. people += results['searchResults']
  40. start += count
  41. with open(outname, 'w') as outfile:
  42. json.dump(people, outfile, indent=2)
  43. return outname
  44. def get_images(datafile):
  45. with open(datafile, 'r') as infile:
  46. people = json.load(infile)
  47. people = [p['member'] for p in people]
  48. for p in people:
  49. if 'vectorImage' not in p:
  50. continue
  51. pid = p['memberId']
  52. outname = 'images/{}.jpg'.format(pid)
  53. if os.path.exists(outname):
  54. print('skipping')
  55. continue
  56. url = p['vectorImage']['rootUrl']
  57. url += sorted(p['vectorImage']['artifacts'], key=lambda x: x['width'])[-1]['fileIdentifyingUrlPathSegment']
  58. print(url)
  59. download_file(url, outname)
  60. time.sleep(1)
  61. def get_profile(pid):
  62. outname = 'profiles/{}.json'.format(pid)
  63. if os.path.exists(outname):
  64. return outname
  65. out = {}
  66. url = 'https://www.linkedin.com/sales/people/{},NAME_SEARCH'.format(pid)
  67. print(url)
  68. response = requests.get(url, headers=headers.headers)
  69. soup = BeautifulSoup(response.text, 'html.parser')
  70. codes = soup.select('code')
  71. for c in codes:
  72. try:
  73. d = json.loads(c.text)
  74. if 'contactInfo' in d:
  75. out = d
  76. break
  77. except Exception as e:
  78. continue
  79. with open(outname, 'w') as outfile:
  80. json.dump(out, outfile)
  81. time.sleep(1)
  82. return outname
  83. def get_profiles(datafile):
  84. with open(datafile, 'r') as infile:
  85. data = json.load(infile)
  86. for d in data:
  87. pid = d['member']['profileId']
  88. get_profile(pid)
  89. def clean_and_parse(datafile, outname):
  90. out = []
  91. with open(datafile, 'r') as infile:
  92. data = json.load(infile)
  93. for d in data:
  94. mid = d['member']['memberId']
  95. pid = d['member']['profileId']
  96. imgpath = 'images/{}.jpg'.format(mid)
  97. if not os.path.exists(imgpath):
  98. imgpath = None
  99. item = {
  100. 'name': d['member'].get('formattedName', ''),
  101. 'title': d['member'].get('title', ''),
  102. 'img': imgpath,
  103. 'company': d['company'].get('companyName', ''),
  104. 'location': d['member'].get('location', ''),
  105. 'id': d['member']['memberId'],
  106. 'linkedin': 'https://linkedin.com/in/' + pid,
  107. }
  108. # profile_file = 'profiles/{}.json'.format(pid)
  109. # if os.path.exists(profile_file):
  110. # with open(profile_file, 'r') as profilein:
  111. # profile = json.load(profilein)
  112. out.append(item)
  113. with open(outname + '.json', 'w') as jsonfile:
  114. json.dump(out, jsonfile, indent=2)
  115. with open(outname + '.csv', 'w') as csvfile:
  116. fieldnames = list(out[0].keys())
  117. writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
  118. writer.writeheader()
  119. for row in out:
  120. writer.writerow(row)
  121. with open('template.html', 'r') as templatefile:
  122. template = Template(templatefile.read())
  123. html = template.render(people=out)
  124. with open('index.html', 'w') as htmlout:
  125. htmlout.write(html)
  126. if __name__ == '__main__':
  127. ICE = '533534'
  128. datafile = 'ice_raw.json'
  129. get_company(ICE, datafile)
  130. get_profiles(datafile)
  131. get_images(datafile)
  132. clean_and_parse(datafile, 'ice')