Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

How to scrape data from page using beautifulsoup

import requests
from bs4 import BeautifulSoup
import pandas as pd
# Scrape the location and website text from every investor profile linked on
# the "invested in female founders" list page, then tabulate the results.
baseurl = 'https://signal.nfx.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
}

# Send the same browser-like headers here as for the profile requests below;
# the original list-page request was the only one made without them.
r = requests.get(
    'https://signal.nfx.com/investor-lists/top-who-invested-in-female-founders-investors',
    headers=headers,
)
soup = BeautifulSoup(r.content, 'html.parser')
tra = soup.find_all('div', class_='pr3')

# Absolute profile URLs collected from every <a href=...> inside the
# 'pr3' containers of the list page.
productlinks = []
for links in tra:
    for link in links.find_all('a', href=True):
        # Normalise the slash at the join so a root-relative href does not
        # produce 'https://signal.nfx.com//...'.
        productlinks.append(baseurl.rstrip('/') + '/' + link['href'].lstrip('/'))

p = []  # one address string per profile ('' when the element is missing)
u = []  # one website string per profile ('' when the element is missing)
for profile_url in productlinks:
    r = requests.get(profile_url, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')

    # soup.find returns None when nothing matches; test for that explicitly
    # instead of hiding every possible error behind a bare ``except``.
    # NOTE(review): an empty row means the selector matched nothing in the
    # HTML that was returned for that profile - confirm the data is actually
    # present in the raw response before relying on these selectors.
    address_tag = soup.find('span', class_='ml1')
    p.append(address_tag.text if address_tag is not None else '')

    # Use a separate name for the website text so the loop variable is not
    # overwritten while iterating.
    site_tag = soup.find('a', class_='ml1 subheader lower-subheader')
    u.append(site_tag.text if site_tag is not None else '')

df = pd.DataFrame({"address": p, "link": u})
print(df)

This is my output: it gives me only one address and link, every row after that is empty, and then the script finishes. Can you help me get all the addresses and links? I am trying to scrape data from the page, but it does not return the complete information. This is one of the links I scrape information from: https://signal.nfx.com/investors/aaleen-anjum

     address             link
0   Toronto, Ontario  twosmallfish.vc
1                                    
2                                    
3                                    
4                                    
5                                    
6                                    
7                                    
8                                    
9                                    
10                                   
11                                   
like image 871
Amen Aziz Avatar asked Jan 25 '26 02:01

Amen Aziz


1 Answer

You can get the data through the API. For example, here are the investors:

import requests
import pandas as pd

# GraphQL endpoint that every request in this script is POSTed to.
url = "https://signal-api.nfx.com/graphql"

# Browser-like user agent sent with each request.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
}

# Query document for the "vclInvestors" operation. It asks for the list
# identified by $slug and for eight scored investors per page
# (``first: 8``) starting after the cursor $after; each investor node is
# expanded through the investorListInvestorProfileFields fragment.
_vcl_investors_query = "query vclInvestors($slug: String!, $after: String) {\n  list(slug: $slug) {\n    id\n    slug\n    investor_count\n    vertical {\n      id\n      display_name\n      kind\n      __typename\n    }\n    location {\n      id\n      display_name\n      __typename\n    }\n    stage\n    firms {\n      id\n      name\n      slug\n      __typename\n    }\n    scored_investors(first: 8, after: $after) {\n      pageInfo {\n        hasNextPage\n        hasPreviousPage\n        endCursor\n        __typename\n      }\n      record_count\n      edges {\n        node {\n          ...investorListInvestorProfileFields\n          __typename\n        }\n        __typename\n      }\n      __typename\n    }\n    __typename\n  }\n}\n\nfragment investorListInvestorProfileFields on InvestorProfile {\n  id\n  person {\n    id\n    first_name\n    last_name\n    name\n    slug\n    is_me\n    is_on_target_list\n    __typename\n  }\n  image_urls\n  position\n  min_investment\n  max_investment\n  target_investment\n  is_preferred_coinvestor\n  firm {\n    id\n    name\n    slug\n    __typename\n  }\n  investment_locations {\n    id\n    display_name\n    location_investor_list {\n      id\n      slug\n      __typename\n    }\n    __typename\n  }\n  investor_lists {\n    id\n    stage_name\n    slug\n    vertical {\n      id\n      display_name\n      __typename\n    }\n    __typename\n  }\n  __typename\n}\n"

# Request body: operation name, its variables, and the query text.
# "after" starts empty, i.e. no pagination cursor for the first page.
payload = {
    "operationName": "vclInvestors",
    "variables": {
        "slug": "who-invested-in-female-founders",
        "order": [{}],
        "after": "",
    },
    "query": _vcl_investors_query,
}


# Page through the investor list, following the endCursor reported by each
# response until the API says there is no next page.
frames = []          # one normalised DataFrame per fetched page
count = 0            # running number of investor rows collected so far
hasNextPage = True
after = ''

while hasNextPage:
    # Assignment, not comparison: the original ``==`` never stored the cursor
    # in the payload, so the same first page was requested on every pass.
    payload['variables']['after'] = after
    jsonData = requests.post(url, headers=headers, json=payload).json()

    investor_list = jsonData['data']['list']
    scored = investor_list['scored_investors']

    # Flatten the nested edge/node dictionaries into dotted column names.
    df = pd.json_normalize(scored['edges'])
    frames.append(df)

    count += len(df)
    tot = investor_list['investor_count']

    print(f'{count} of {tot}')

    hasNextPage = scored['pageInfo']['hasNextPage']
    after = scored['pageInfo']['endCursor']

# DataFrame.append was removed in pandas 2.0; concatenate all pages once.
# ignore_index=True renumbers the rows 0..n-1, as reset_index(drop=True) did.
results = pd.concat(frames, sort=False, ignore_index=True)

Output:

print(results.head(2).to_string())
            __typename  node.__typename node.id node.person.id node.person.first_name node.person.last_name node.person.name node.person.slug  node.person.is_me  node.person.is_on_target_list node.person.__typename                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           node.image_urls node.position  node.min_investment  node.max_investment  node.target_investment  node.is_preferred_coinvestor node.firm.id           node.firm.name           node.firm.slug 
node.firm.__typename                                                                                                                                                                                                                                                                                node.investment_locations                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              node.investor_lists  node.firm
0  InvestorProfileEdge  InvestorProfile   19676          87099                 Aaleen                 Anjum     Aaleen Anjum     aaleen-anjum              False                          False                 Person  [https://signal-api.nfx.com/rails/active_storage/representations/redirect/eyJfcmFpbHMiOnsibWVzc2FnZSI6IkJBaHBBMWp2QVE9PSIsImV4cCI6bnVsbCwicHVyIjoiYmxvYl9pZCJ9fQ==--1dc8054880c588f1fd59361ebd5d8526f841049d/eyJfcmFpbHMiOnsibWVzc2FnZSI6IkJBaDdCem9MWm05eWJXRjBPZ2hxY0djNkUzSmxjMmw2WlY5MGIxOW1hV3hzV3dkcEFsZ0NhUUpZQWc9PSIsImV4cCI6bnVsbCwicHVyIjoidmFyaWF0aW9uIn19--f8e22238db523e6e5e5a8ae643921849c4b207bd/0, https://signal-api.nfx.com/rails/active_storage/representations/redirect/eyJfcmFpbHMiOnsibWVzc2FnZSI6IkJBaHBBMW52QVE9PSIsImV4cCI6bnVsbCwicHVyIjoiYmxvYl9pZCJ9fQ==--df77fc9ad679d550ce8e2472e47150cb9fc610e6/eyJfcmFpbHMiOnsibWVzc2FnZSI6IkJBaDdCem9MWm05eWJXRjBPZ2hxY0djNkUzSmxjMmw2WlY5MGIxOW1hV3hzV3dkcEFsZ0NhUUpZQWc9PSIsImV4cCI6bnVsbCwicHVyIjoidmFyaWF0aW9uIn19--f8e22238db523e6e5e5a8ae643921849c4b207bd/1, https://signal-api.nfx.com/rails/active_storage/representations/redirect/eyJfcmFpbHMiOnsibWVzc2FnZSI6IkJBaHBBMXJ2QVE9PSIsImV4cCI6bnVsbCwicHVyIjoiYmxvYl9pZCJ9fQ==--1f58605b9a843b9ee1e820d63d154aea24936f84/eyJfcmFpbHMiOnsibWVzc2FnZSI6IkJBaDdCem9MWm05eWJXRjBPZ2hxY0djNkUzSmxjMmw2WlY5MGIxOW1hV3hzV3dkcEFsZ0NhUUpZQWc9PSIsImV4cCI6bnVsbCwicHVyIjoidmFyaWF0aW9uIn19--f8e22238db523e6e5e5a8ae643921849c4b207bd/2, https://signal-api.nfx.com/rails/active_storage/representations/redirect/eyJfcmFpbHMiOnsibWVzc2FnZSI6IkJBaHBBMXZ2QVE9PSIsImV4cCI6bnVsbCwicHVyIjoiYmxvYl9pZCJ9fQ==--2a200a001411bbff92bd9deb68b4a54215ee0863/eyJfcmFpbHMiOnsibWVzc2FnZSI6IkJBaDdCem9MWm05eWJXRjBPZ2hxY0djNkUzSmxjMmw2WlY5MGIxOW1hV3hzV3dkcEFsZ0NhUUpZQWc9PSIsImV4cCI6bnVsbCwicHVyIjoidmFyaWF0aW9uIn19--f8e22238db523e6e5e5a8ae643921849c4b207bd/3]       analyst               150000              1000000                  250000                         False         4445  Two Small Fish Ventures  two-small-fish-ventures   
              Firm                                                                                                                                                                                                                                                                                                       []  [{'id': '6141', 'stage_name': 'Pre-Seed', 'slug': 'ai-pre-seed', 'vertical': {'id': '3', 'display_name': 'AI', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '3', 'stage_name': 'Seed', 'slug': 'ai-seed', 'vertical': {'id': '3', 'display_name': 'AI', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '6144', 'stage_name': 'Pre-Seed', 'slug': 'blockchain-pre-seed', 'vertical': {'id': '7', 'display_name': 'Blockchain', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '7', 'stage_name': 'Seed', 'slug': 'blockchain-seed', 'vertical': {'id': '7', 'display_name': 'Blockchain', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '5406', 'stage_name': 'Other Lists', 'slug': 'british-columbia', 'vertical': {'id': '9678', 'display_name': 'British Columbia', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '6169', 'stage_name': 'Pre-Seed', 'slug': 'consumer-health-pre-seed', 'vertical': {'id': '11', 'display_name': 'Consumer Health', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '11', 'stage_name': 'Seed', 'slug': 'consumer-health-seed', 'vertical': {'id': '11', 'display_name': 'Consumer Health', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '6145', 'stage_name': 'Pre-Seed', 'slug': 'cryptocurrency-pre-seed', 'vertical': {'id': '13', 'display_name': 'Cryptocurrency', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '13', 'stage_name': 'Seed', 'slug': 'cryptocurrency-seed', 'vertical': {'id': '13', 'display_name': 'Cryptocurrency', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '6203', 'stage_name': 'Pre-Seed', 'slug': 'cybersecurity-pre-seed', 'vertical': {'id': 
'57799', 'display_name': 'Cybersecurity', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '5554', 'stage_name': 'Seed', 'slug': 'cybersecurity-seed', 'vertical': {'id': '57799', 'display_name': 'Cybersecurity', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '6172', 'stage_name': 'Pre-Seed', 'slug': 'developer-tools-pre-seed', 'vertical': {'id': '15', 'display_name': 'Developer Tools', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '15', 'stage_name': 'Seed', 'slug': 'developer-tools-seed', 'vertical': {'id': '15', 'display_name': 'Developer Tools', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '6205', 'stage_name': 'Pre-Seed', 'slug': 'digital-health-pre-seed', 'vertical': {'id': '57801', 'display_name': 'Digital Health', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '5644', 'stage_name': 'Seed', 'slug': 'digital-health-seed', 'vertical': {'id': '57801', 'display_name': 'Digital Health', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '6207', 'stage_name': 'Pre-Seed', 'slug': 'direct-to-consumer-dtc-pre-seed', 'vertical': {'id': '57803', 'display_name': 'Direct-to-Consumer (DTC)', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '5734', 'stage_name': 'Seed', 'slug': 'direct-to-consumer-dtc-seed', 'vertical': {'id': '57803', 'display_name': 'Direct-to-Consumer (DTC)', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '171', 'stage_name': 'Other Lists', 'slug': 'diverse', 'vertical': {'id': '24242', 'display_name': 'Diverse Investors', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '6148', 'stage_name': 'Pre-Seed', 'slug': 'enterprise-pre-seed', 'vertical': {'id': '20', 'display_name': 'Enterprise', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '20', 'stage_name': 'Seed', 'slug': 'enterprise-seed', 'vertical': {'id': '20', 'display_name': 'Enterprise', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '172', 'stage_name': 'Other 
Lists', 'slug': 'female', 'vertical': {'id': '24241', 'display_name': 'Female Investors', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '6162', 'stage_name': 'Pre-Seed', 'slug': 'saas-pre-seed', 'vertical': {'id': '48', 'display_name': 'SaaS', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '47', 'stage_name': 'Seed', 'slug': 'saas-seed', 'vertical': {'id': '48', 'display_name': 'SaaS', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '169', 'stage_name': 'Other Lists', 'slug': 'who-invested-in-diverse-founders', 'vertical': {'id': '24244', 'display_name': 'Investors who invested in diverse founders', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '170', 'stage_name': 'Other Lists', 'slug': 'who-invested-in-female-founders', 'vertical': {'id': '24243', 'display_name': 'Investors who invested in female founders', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '187', 'stage_name': 'Other Lists', 'slug': 'who-were-founders', 'vertical': {'id': '24387', 'display_name': 'Investors who were founders', '__typename': 'Tag'}, '__typename': 'InvestorList'}]        NaN
1  InvestorProfileEdge  InvestorProfile   13187          29548                  Aamir                Virani     Aamir Virani     aamir-virani              False                          False                 Person                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        [https://signal-api.nfx.com/rails/active_storage/representations/redirect/eyJfcmFpbHMiOnsibWVzc2FnZSI6IkJBaHBBeTJHQVE9PSIsImV4cCI6bnVsbCwicHVyIjoiYmxvYl9pZCJ9fQ==--a7cd75f799cb3eb96a06cbd6b67d287971185953/eyJfcmFpbHMiOnsibWVzc2FnZSI6IkJBaDdCem9MWm05eWJXRjBPZ2hxY0djNkUzSmxjMmw2WlY5MGIxOW1hV3hzV3dkcEFsZ0NhUUpZQWc9PSIsImV4cCI6bnVsbCwicHVyIjoidmFyaWF0aW9uIn19--f8e22238db523e6e5e5a8ae643921849c4b207bd/0]         angel                    1               100000                   25000                         False          NaN                      NaN                      NaN   
               NaN  [{'id': '7500', 'display_name': 'California', 'location_investor_list': None, '__typename': 'Tag'}, {'id': '7502', 'display_name': 'Texas', 'location_investor_list': None, '__typename': 'Tag'}, {'id': '7498', 'display_name': 'United States', 'location_investor_list': None, '__typename': 'Tag'}]                                                                                                                                                                                                                                                                                                                                                                                                                                         [{'id': '6141', 'stage_name': 'Pre-Seed', 'slug': 'ai-pre-seed', 'vertical': {'id': '3', 'display_name': 'AI', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '3', 'stage_name': 'Seed', 'slug': 'ai-seed', 'vertical': {'id': '3', 'display_name': 'AI', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '6170', 'stage_name': 'Pre-Seed', 'slug': 'consumer-internet-pre-seed', 'vertical': {'id': '12', 'display_name': 'Consumer Internet', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '12', 'stage_name': 'Seed', 'slug': 'consumer-internet-seed', 'vertical': {'id': '12', 'display_name': 'Consumer Internet', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '6152', 'stage_name': 'Pre-Seed', 'slug': 'hardware-pre-seed', 'vertical': {'id': '28', 'display_name': 'Hardware', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '28', 'stage_name': 'Seed', 'slug': 'hardware-seed', 'vertical': {'id': '28', 'display_name': 'Hardware', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '6156', 'stage_name': 'Pre-Seed', 'slug': 'iot-pre-seed', 'vertical': {'id': '34', 'display_name': 'IoT', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '33', 'stage_name': 'Seed', 'slug': 'iot-seed', 
'vertical': {'id': '34', 'display_name': 'IoT', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '6179', 'stage_name': 'Pre-Seed', 'slug': 'local-services-pre-seed', 'vertical': {'id': '35', 'display_name': 'Local Services', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '34', 'stage_name': 'Seed', 'slug': 'local-services-seed', 'vertical': {'id': '35', 'display_name': 'Local Services', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '6185', 'stage_name': 'Pre-Seed', 'slug': 'parenting-families-pre-seed', 'vertical': {'id': '43', 'display_name': 'Parenting/Families', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '42', 'stage_name': 'Seed', 'slug': 'parenting-families-seed', 'vertical': {'id': '43', 'display_name': 'Parenting/Families', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '6199', 'stage_name': 'Pre-Seed', 'slug': 'real-estate-proptech-pre-seed', 'vertical': {'id': '45', 'display_name': 'Real Estate/PropTech', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '44', 'stage_name': 'Seed', 'slug': 'real-estate-proptech-seed', 'vertical': {'id': '45', 'display_name': 'Real Estate/PropTech', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '6161', 'stage_name': 'Pre-Seed', 'slug': 'robotics-pre-seed', 'vertical': {'id': '47', 'display_name': 'Robotics', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '46', 'stage_name': 'Seed', 'slug': 'robotics-seed', 'vertical': {'id': '47', 'display_name': 'Robotics', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '6162', 'stage_name': 'Pre-Seed', 'slug': 'saas-pre-seed', 'vertical': {'id': '48', 'display_name': 'SaaS', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '47', 'stage_name': 'Seed', 'slug': 'saas-seed', 'vertical': {'id': '48', 'display_name': 'SaaS', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '188', 'stage_name': 'Other Lists', 'slug': 'san-francisco-bay-area', 'vertical': 
{'id': '22992', 'display_name': 'San Francisco Bay Area', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '6187', 'stage_name': 'Pre-Seed', 'slug': 'smb-software-pre-seed', 'vertical': {'id': '51', 'display_name': 'SMB Software', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '50', 'stage_name': 'Seed', 'slug': 'smb-software-seed', 'vertical': {'id': '51', 'display_name': 'SMB Software', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '169', 'stage_name': 'Other Lists', 'slug': 'who-invested-in-diverse-founders', 'vertical': {'id': '24244', 'display_name': 'Investors who invested in diverse founders', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '170', 'stage_name': 'Other Lists', 'slug': 'who-invested-in-female-founders', 'vertical': {'id': '24243', 'display_name': 'Investors who invested in female founders', '__typename': 'Tag'}, '__typename': 'InvestorList'}, {'id': '187', 'stage_name': 'Other Lists', 'slug': 'who-were-founders', 'vertical': {'id': '24387', 'display_name': 'Investors who were founders', '__typename': 'Tag'}, '__typename': 'InvestorList'}]        NaN
like image 109
chitown88 Avatar answered Jan 26 '26 18:01

chitown88



Donate For Us

If you love us, you can donate to us via PayPal or buy us a coffee so we can maintain and grow the site. Thank you!