# Aquí vamos a introducir, con un ejemplo, cómo obtener datos de una web.
# Importamos los paquetes
from requests_html import HTML
# Read the sample page from disk. Only the read needs the open file handle,
# so the rest of the script runs after the `with` block has closed it.
with open(r"F:\curso_python_uruguay\lectures\lecture5_scrapping\simple.html") as file_html:
    source = file_html.read()

# Wrap the raw markup in a requests_html HTML object so it can be queried.
html = HTML(html=source)
# A bare `type(html)` only shows something in a REPL/notebook; print it so the
# script displays the type as well.
print(type(html))
# print(html.html)  # alternative: print the `html` attribute instead of the text
print(html.text)
# `find` takes a CSS selector; called like this the result is indexable, so
# individual matches are reached with match[0], match[1], ...
match = html.find("title")
print(match)
# Bare expressions such as `type(match)` are silently discarded when the file
# runs as a script, so their values are printed explicitly.
print(type(match))
print(match[0])
print(match[0].text)
# Inspect which attributes and methods a matched element offers.
print(dir(match[0]))

# To fetch only the first match, pass first=True.
match = html.find("title", first=True)
print(match)
# `find` works with CSS selectors; see
# https://www.w3schools.com/cssref/css_selectors.php

# Select by id: "#site_title" targets the element whose id is site_title.
match = html.find("#site_title", first=True)
print(match.text)

# Article headlines: take only the first <div class="article">.
match = html.find("div.article", first=True)
print(match.text)

# Now pull each part of that article out separately: show its markup, then
# look up the heading and the summary paragraph inside it.
print(match.html)
head = match.find("h2", first=True)
resumen = match.find("p", first=True)
print(head.text)
print(resumen.text)
# Without first=True, `find` yields every element matching the selector,
# so the result can be iterated article by article.
match = html.find("div.article")
print(match)
for artic in match:
    # Heading and summary of the current article.
    head = artic.find("h2", first=True)
    print(head.text)
    resumen = artic.find("p", first=True)
    print(resumen.text)
    # Visual separator between consecutive articles.
    print("\n")
from bs4 import BeautifulSoup
import requests
# Parse the same sample page with BeautifulSoup. The parser reads from the
# open file object, so the call has to happen inside the `with` block.
with open(r"F:\curso_python_uruguay\lectures\lecture5_scrapping\simple.html") as file_html:
    soup = BeautifulSoup(file_html, 'lxml')

# Show the parsed document as-is, then in its indented ("prettified") form.
print(soup)
print(soup.prettify())

# Attribute access by tag name: the <title> tag, then just its text.
match = soup.title
print(match)
print(match.text)
# Attribute access picks the first occurrence of the tag.
match = soup.div
print(match)

# `find` with a tag name also picks the first occurrence.
match = soup.find("div")
print(match)

# Narrow the lookup to a <div> carrying the CSS class "article".
match = soup.find("div", class_="article")
print(match)

# Look an element up by its id attribute (here: site_title).
match = soup.find(id="site_title")
print(match)
# Title and summary of the first article: the title text sits in the link
# inside the article's <h2>, the summary in its first <p>.
article = soup.find("div", class_="article")
titulo = article.h2.a.text
print(titulo)
resumen = article.p.text
print(resumen)

# Titles of all the articles: find_all returns every match (a list-like
# result), whereas find returns only the first.
article = soup.find_all("div", class_="article")
# A bare `article` expression shows nothing when run as a script; print it.
print(article)
# Reuse the result above instead of querying the document a second time.
for art in article:
    titulo = art.h2.a.text
    print(titulo)