Здесь показаны различия между двумя версиями данной страницы.
| Следующая версия | Предыдущая версия | ||
|
python:parsers:pagination [2020/03/22 17:17] werwolf создано |
python:parsers:pagination [2023/01/12 12:18] (текущий) |
||
|---|---|---|---|
| Строка 2: | Строка 2: | ||
| ====example 1==== | ====example 1==== | ||
| + | Форматируя url, пробежимся по нескольким страницам | ||
| <code python> | <code python> | ||
| import requests | import requests | ||
| Строка 79: | Строка 80: | ||
| </code> | </code> | ||
| + | ====example 2==== | ||
| + | |||
| + | [[https://coinmarketcap.com/|coinmarketcap]] | ||
| + | {{ :python:parsers:parse2.png |}} | ||
| + | |||
| + | <code python> | ||
| + | import requests | ||
| + | from bs4 import BeautifulSoup | ||
| + | import csv | ||
| + | import re | ||
| + | |||
| + | |||
def get_html(url, timeout=10):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so that
            a stalled connection cannot block the scraper indefinitely.

    Returns:
        The response text when the response is OK; otherwise ``None``,
        after printing the HTTP status code.

    Raises:
        requests.RequestException: Propagated unchanged from
            ``requests.get`` (connection failure, timeout, ...).
    """
    r = requests.get(url, timeout=timeout)
    if r.ok:
        return r.text
    print(r.status_code)
    # Make the "no page" result explicit for callers.
    return None
| + | |||
| + | |||
def write_csv(data, filename='cmc.csv'):
    """Append one record to a CSV file as a ``name, url, price`` row.

    Args:
        data: Mapping that provides the keys ``'name'``, ``'url'`` and
            ``'price'``.
        filename: Path of the CSV file to append to.

    Raises:
        KeyError: If one of the three keys is missing from ``data``.
    """
    # newline='' hands line-ending control to the csv module (without it,
    # blank rows appear between records on Windows); an explicit UTF-8
    # encoding keeps non-ASCII names independent of the platform default.
    with open(filename, 'a', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow((data['name'], data['url'], data['price']))
| + | |||
def get_page_data(html):
    """Parse one listing page and append every table row to the CSV file.

    For each ``<tr class="cmc-table-row">`` the second cell supplies the
    name and the link, and the fourth cell supplies the price. A row is
    always written; fields that cannot be found fall back to placeholder
    values (``''`` for name and url, ``'price: null'`` for price).

    Args:
        html: HTML source of the page.
    """
    soup = BeautifulSoup(html, 'lxml')

    for tr in soup.find_all('tr', class_='cmc-table-row'):
        tds = tr.find_all('td')

        # The name link is looked up once and reused for both name and url.
        # Explicit checks replace the former bare ``except:`` clauses, which
        # also swallowed KeyboardInterrupt and genuine programming errors.
        name_link = tds[1].find('a', class_='cmc-link') if len(tds) > 1 else None
        if name_link is not None:
            name = name_link.text.strip()
            href = name_link.get('href')
        else:
            name = ''
            href = None

        # A link without an href yields an empty url, as before.
        url = 'https://coinmarketcap.com' + href if href is not None else ''

        price_link = tds[3].find('a') if len(tds) > 3 else None
        price = price_link.text.strip() if price_link is not None else 'price: null'

        write_csv({'name': name,
                   'url': url,
                   'price': price})
| + | |||
def main():
    """Scrape the listing page by page, following the "Next" link.

    Starts at the site root, writes every page's rows through
    ``get_page_data`` and stops when a page cannot be downloaded or when
    no usable "Next" link is found in the ``cmc-button-group`` block.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import urljoin

    base_url = 'https://coinmarketcap.com/'
    next_pattern = re.compile('Next')
    url = base_url

    while True:
        # Download each page once and reuse the markup for both the row
        # extraction and the pagination lookup.
        html = get_html(url)
        if html is None:
            # get_html already printed the status code; nothing to parse.
            break

        get_page_data(html)

        soup = BeautifulSoup(html, 'lxml')
        button_group = soup.find('div', class_='cmc-button-group')
        next_link = None
        if button_group is not None:
            next_link = button_group.find('a', text=next_pattern)
        href = next_link.get('href') if next_link is not None else None
        if not href:
            # No further page (or an empty link that would loop forever).
            break

        # urljoin resolves against the site root and copes with hrefs both
        # with and without a leading slash, avoiding a doubled "//".
        url = urljoin(base_url, href)
| + | |||
# Run the scraper only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
| + | |||
| + | </code> | ||