# Scrape LinkedIn Sales Navigator search results for a company's employees,
# then export the cleaned data as JSON and CSV.
import time
import json
import csv
import os

import requests
from bs4 import BeautifulSoup

import headers  # local module exposing the authenticated request headers


def download_file(url, local_filename=None):
    # Stream a file (here: profile images) to disk in 1 KB chunks.
    if local_filename is None:
        local_filename = url.split('/')[-1]
    print('saving to', local_filename)
    r = requests.get(url, stream=True)
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
    return local_filename


def get_page(company_id, start=0, count=50):
    # Fetch one page of Sales Navigator search results, filtered to a
    # single company via the 'CC' (current company) facet.
    params = {
        'facet': 'CC',
        'facet.CC': company_id,
        'count': count,
        'start': start,
    }
    response = requests.get('https://www.linkedin.com/sales/search/results',
                            headers=headers.headers, params=params)
    return response.json()


def get_company(company_id, outname):
    # Page through every search result for the company and dump the
    # combined list of people to a JSON file.
    count = 50
    start = 0
    results = get_page(company_id)
    total = results['pagination']['total']
    people = results['searchResults']
    start += count
    while start < total:
        print('getting', start, total)
        time.sleep(1)
        results = get_page(company_id, start)
        people += results['searchResults']
        start += count
    with open(outname, 'w') as outfile:
        json.dump(people, outfile, indent=2)
    return outname


def get_images(datafile):
    # Download the largest available profile image for each person into
    # images/<memberId>.jpg (the images/ directory must already exist).
    with open(datafile, 'r') as infile:
        people = json.load(infile)
    people = [p['member'] for p in people]
    for p in people:
        if 'vectorImage' not in p:
            continue
        pid = p['memberId']
        outname = 'images/{}.jpg'.format(pid)
        if os.path.exists(outname):
            print('skipping')
            continue
        url = p['vectorImage']['rootUrl']
        url += sorted(p['vectorImage']['artifacts'],
                      key=lambda x: x['width'])[-1]['fileIdentifyingUrlPathSegment']
        print(url)
        download_file(url, outname)
        time.sleep(1)


def get_profile(pid):
    # Fetch a single profile page and extract the embedded JSON payload
    # (the <code> tag whose contents include 'contactInfo'), caching the
    # result in profiles/<profileId>.json (the profiles/ directory must
    # already exist).
    outname = 'profiles/{}.json'.format(pid)
    if os.path.exists(outname):
        return outname
    out = {}
    url = 'https://www.linkedin.com/sales/people/{},NAME_SEARCH'.format(pid)
    print(url)
    response = requests.get(url, headers=headers.headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    codes = soup.select('code')
    for c in codes:
        try:
            d = json.loads(c.text)
            if 'contactInfo' in d:
                out = d
                break
        except Exception:
            continue
    with open(outname, 'w') as outfile:
        json.dump(out, outfile)
    time.sleep(1)
    return outname


def get_profiles(datafile):
    # Fetch the full profile for every person in the search results.
    with open(datafile, 'r') as infile:
        data = json.load(infile)
    for d in data:
        pid = d['member']['profileId']
        get_profile(pid)


def clean_and_parse(datafile, outname):
    # Reduce the raw search results to a flat list of records and write
    # them out as both JSON and CSV.
    out = []
    with open(datafile, 'r') as infile:
        data = json.load(infile)
    for d in data:
        mid = d['member']['memberId']
        pid = d['member']['profileId']
        imgpath = 'images/{}.jpg'.format(mid)
        if not os.path.exists(imgpath):
            imgpath = None
        item = {
            'name': d['member'].get('formattedName', ''),
            'title': d['member'].get('title', ''),
            'img': imgpath,
            'company': d['company'].get('companyName', ''),
            'location': d['member'].get('location', ''),
            'id': d['member']['memberId'],
            'linkedin': 'https://linkedin.com/in/' + pid,
        }
        # profile_file = 'profiles/{}.json'.format(pid)
        # if os.path.exists(profile_file):
        #     with open(profile_file, 'r') as profilein:
        #         profile = json.load(profilein)
        out.append(item)
    with open(outname + '.json', 'w') as jsonfile:
        json.dump(out, jsonfile, indent=2)
    # newline='' keeps csv.DictWriter from emitting blank rows on Windows
    with open(outname + '.csv', 'w', newline='') as csvfile:
        fieldnames = ['name', 'title', 'location', 'url']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in out:
            writer.writerow({
                'name': row['name'],
                'title': row['title'],
                'location': row['location'],
                'url': row['linkedin'],
            })


if __name__ == '__main__':
    ICE = '533534'  # LinkedIn company id used for the search facet
    datafile = 'ice_raw.json'
    # get_company(ICE, datafile)
    # get_profiles(datafile)
    # get_images(datafile)
    clean_and_parse(datafile, 'ice')
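
Note: the script imports a local headers module that is not included in this file. A minimal sketch of what headers.py likely contains, assuming the request headers are copied from a logged-in Sales Navigator browser session (all values below are placeholders, not real credentials):

# headers.py (hypothetical sketch, not part of this repository file).
# Fill in values from your own authenticated browser session, e.g. from
# the request headers shown in the browser's network inspector.
headers = {
    'user-agent': 'Mozilla/5.0 (compatible)',
    # li_at and JSESSIONID are LinkedIn's session cookies
    'cookie': 'li_at=<session cookie>; JSESSIONID="ajax:<token>"',
    # LinkedIn expects a csrf-token header matching the JSESSIONID value
    'csrf-token': 'ajax:<token>',
    'x-requested-with': 'XMLHttpRequest',
}

With that module in place, uncommenting the three calls in __main__ runs the full pipeline: get_company collects the search results into ice_raw.json, get_profiles and get_images fetch the per-person data, and clean_and_parse writes ice.json and ice.csv.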