Форматируя URL, пробежимся по нескольким страницам.
import csv

import requests
from bs4 import BeautifulSoup


def get_html(url, timeout=30):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The page text, or ``None`` when the request fails or the server
        answers with an error status. The reason is printed in both cases.
    """
    try:
        r = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Network failure or timeout: report it and let the caller skip the page.
        print(exc)
        return None
    if r.ok:  # true for any status code below 400
        return r.text
    print(r.status_code)
    return None


def refine_cy(s):
    """Return the last space-separated token of *s*.

    The citation-index cell reads like ``'ТИЦ: 500000'``; only the trailing
    value (``'500000'``) is kept.
    """
    return s.split(' ')[-1]


def write_csv(data):
    """Append one row (name, url, snippet, cy) taken from *data* to yaca.csv."""
    # newline='' leaves line endings to the csv module (no blank rows on
    # Windows); the explicit encoding keeps non-ASCII text writable
    # regardless of the system locale.
    with open('yaca.csv', 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow((data['name'], data['url'], data['snippet'], data['cy']))


def get_page_data(html):
    """Parse one catalogue page and append every listed site to the CSV file.

    Each ``<li class="yaca-snippet">`` element yields one row. A field whose
    element is missing from the markup is written as an empty string.

    Args:
        html: Page markup, or ``None``/empty when the download failed, in
            which case nothing is written.
    """
    if not html:
        return

    soup = BeautifulSoup(html, 'lxml')
    for li in soup.find_all('li', class_='yaca-snippet'):
        # find() returns None for a missing element, so attribute access on
        # the result raises AttributeError; that is the only failure handled.
        try:
            name = li.find('h2').text
        except AttributeError:
            name = ''

        try:
            url = li.find('h2').find('a').get('href')
        except AttributeError:
            url = ''

        try:
            snippet = li.find('div', class_='yaca-snippet__text').text.strip()
        except AttributeError:
            snippet = ''

        try:
            cy = refine_cy(li.find('div', class_='yaca-snippet__cy').text.strip())
        except AttributeError:
            cy = ''

        write_csv({'name': name, 'url': url, 'snippet': snippet, 'cy': cy})


def main():
    """Scrape pages 0-4 of the Entertainment category into yaca.csv."""
    pattern = 'https://yandex.ru/yaca/cat/Entertainment/{}.html'
    for i in range(5):
        get_page_data(get_html(pattern.format(i)))


if __name__ == '__main__':
    main()
import csv
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Site root: the first page to scrape and the base for relative coin links.
BASE_URL = 'https://coinmarketcap.com/'


def get_html(url, timeout=30):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The page text, or ``None`` when the request fails or the server
        answers with an error status. The reason is printed in both cases.
    """
    try:
        r = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Network failure or timeout: report it and let the caller stop.
        print(exc)
        return None
    if r.ok:  # true for any status code below 400
        return r.text
    print(r.status_code)
    return None


def write_csv(data):
    """Append one row (name, url, price) taken from *data* to cmc.csv."""
    # newline='' leaves line endings to the csv module (no blank rows on
    # Windows); the explicit encoding keeps the output independent of the
    # system locale.
    with open('cmc.csv', 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow((data['name'], data['url'], data['price']))


def get_page_data(html):
    """Parse one listing page and append every table row to the CSV file.

    Each ``<tr class="cmc-table-row">`` yields one row: the name and link
    come from the ``cmc-link`` anchor in the second cell, the price from the
    anchor in the fourth cell. A missing name or link becomes an empty
    string; a missing price becomes ``'price: null'``.

    Args:
        html: Page markup, or ``None``/empty when the download failed, in
            which case nothing is written.
    """
    if not html:
        return

    soup = BeautifulSoup(html, 'lxml')
    for tr in soup.find_all('tr', class_='cmc-table-row'):
        tds = tr.find_all('td')

        link = tds[1].find('a', class_='cmc-link') if len(tds) > 1 else None
        name = link.text.strip() if link is not None else ''
        href = link.get('href') if link is not None else None
        # urljoin avoids doubled or missing slashes and leaves absolute
        # hrefs untouched.
        url = urljoin(BASE_URL, href) if href else ''

        try:
            price = tds[3].find('a').text.strip()
        except (IndexError, AttributeError):
            # Row has fewer than four cells, or the cell has no anchor.
            price = 'price: null'

        write_csv({'name': name, 'url': url, 'price': price})


def _next_page_url(soup, current_url):
    """Return the absolute URL of the "Next" pagination link, or ``None``."""
    group = soup.find('div', class_='cmc-button-group')
    if group is None:
        return None
    link = group.find('a', string=re.compile('Next'))
    if link is None:
        return None
    href = link.get('href')
    if not href:
        return None
    # Resolve against the page the link was found on, as a browser would.
    return urljoin(current_url, href)


def main():
    """Scrape the listing page by page, following "Next" links until none is left."""
    url = BASE_URL
    visited = set()
    # The visited set stops the crawl if pagination ever points back at a
    # page that was already processed.
    while url and url not in visited:
        visited.add(url)
        html = get_html(url)  # fetch once; reused for both rows and pagination
        if not html:
            break
        get_page_data(html)
        url = _next_page_url(BeautifulSoup(html, 'lxml'), url)


if __name__ == '__main__':
    main()