스크롤러2

[ 기타 활동 ]/파이썬

스크롤러2

유니시티황 2018. 6. 28. 03:01

from bs4 import BeautifulSoup

import requests

import urllib.parse

# Google search URL: up to 100 results for "아두이노 pdf" (Arduino PDFs)
# restricted to koreasw.org/wp-content/.
address = 'https://www.google.co.kr/search?num=100&ie=UTF-8&q=site:koreasw.org/wp-content/+아두이노+pdf'


def decode_result_href(href):
    """Return the destination URL carried by a search-result link.

    Args:
        href: The raw ``href`` attribute of a result anchor, expected to look
            like ``/url?q=<destination>&<tracking parameters>``.

    Returns:
        The destination with the ``/url?q=`` prefix and everything from the
        first ``&`` removed, ``%2520`` turned into a space, and the remaining
        percent-escapes decoded (needed for Korean file names).
    """
    prefix = '/url?q='
    # str.lstrip() takes a *set of characters*, not a prefix, so it would also
    # eat leading characters of the destination (e.g. the "r" of "rtsp://").
    # Remove the exact prefix instead.
    if href.startswith(prefix):
        href = href[len(prefix):]
    # Drop the extra query parameters appended after the destination.
    href = href.split('&')[0]
    # "%2520" is a percent-encoded "%20", i.e. a doubly-encoded space.
    href = href.replace("%2520", " ")
    # Decode remaining escapes so non-ASCII (Korean) file names are readable.
    return urllib.parse.unquote(href)


if __name__ == "__main__":
    resp = requests.get(address)
    resp.raise_for_status()
    # NOTE(review): the body is force-decoded as EUC-KR; presumably that is
    # what the server returned for this client -- confirm before changing.
    resp.encoding = 'euc-kr'
    html = resp.text

    bs = BeautifulSoup(html, 'html.parser')
    for tags in bs.select('div.jfp3ef a'):
        href = tags.get('href')
        if not href:
            # Anchor without an href: nothing to print.
            continue
        print(decode_result_href(href))

from bs4 import BeautifulSoup

import requests

# Google search URL: up to 100 results for "pdf" restricted to
# www.unicitypartner.cz/pdf/.
address = 'https://www.google.co.kr/search?num=100&ie=UTF-8&q=site:http://www.unicitypartner.cz/pdf/ pdf'


def result_href_to_url(href):
    """Return the destination URL carried by a search-result link.

    Args:
        href: The raw ``href`` attribute of a result anchor, expected to look
            like ``/url?q=<destination>&<tracking parameters>``.

    Returns:
        The destination with the ``/url?q=`` prefix and everything from the
        first ``&`` removed, and ``%2520`` turned into a space. Other
        percent-escapes are left untouched.
    """
    prefix = '/url?q='
    # str.lstrip() takes a *set of characters*, not a prefix, so it would also
    # eat leading characters of the destination. Remove the exact prefix.
    if href.startswith(prefix):
        href = href[len(prefix):]
    # Drop the extra query parameters appended after the destination.
    href = href.split('&')[0]
    # "%2520" is a percent-encoded "%20", i.e. a doubly-encoded space.
    return href.replace("%2520", " ")


if __name__ == "__main__":
    source = requests.get(address)
    # Fail loudly on an HTTP error instead of silently parsing an error page.
    source.raise_for_status()
    html = source.text

    soup = BeautifulSoup(html, 'lxml')
    for tag in soup.select('h3[class=r]'):
        link = tag.a
        if link is None or not link.get('href'):
            # Heading without a usable anchor: nothing to print.
            continue
        print(result_href_to_url(link['href']))

from bs4 import BeautifulSoup

import requests

# Google search URL: up to 100 results for "pdf" restricted to
# www.asamanthinketh.net/files/.
address = 'https://www.google.co.kr/search?num=100&ie=UTF-8&q=site:www.asamanthinketh.net/files/ pdf'


def unwrap_result_href(href):
    """Return the destination URL carried by a search-result link.

    Args:
        href: The raw ``href`` attribute of a result anchor, expected to look
            like ``/url?q=<destination>&<tracking parameters>``.

    Returns:
        The destination with the ``/url?q=`` prefix and everything from the
        first ``&`` removed, and ``%2520`` turned into a space. Other
        percent-escapes are left untouched.
    """
    prefix = '/url?q='
    # str.lstrip() takes a *set of characters*, not a prefix, so it would also
    # eat leading characters of the destination. Remove the exact prefix.
    if href.startswith(prefix):
        href = href[len(prefix):]
    # Drop the extra query parameters appended after the destination.
    href = href.split('&')[0]
    # "%2520" is a percent-encoded "%20", i.e. a doubly-encoded space.
    return href.replace("%2520", " ")


if __name__ == "__main__":
    resp = requests.get(address)
    resp.raise_for_status()
    # NOTE(review): the body is force-decoded as EUC-KR; presumably that is
    # what the server returned for this client -- confirm before changing.
    resp.encoding = 'euc-kr'
    html = resp.text

    bs = BeautifulSoup(html, 'html.parser')
    for tags in bs.select('div.jfp3ef a'):
        href = tags.get('href')
        if not href:
            # Anchor without an href: nothing to print.
            continue
        print(unwrap_result_href(href))

from bs4 import BeautifulSoup

import requests

# Google search URL restricted to cdn-learn.adafruit.com/downloads/pdf/.
#   num=3000  : total number of results to search for.
#   start=100 : with 100 results per page this is the start of the second
#               page; start=200 would be the third page.
address = 'https://www.google.co.kr/search?num=3000&start=100&ie=UTF-8&q=site:https://cdn-learn.adafruit.com/downloads/pdf/'


def clean_result_href(href):
    """Return the destination URL carried by a search-result link.

    Args:
        href: The raw ``href`` attribute of a result anchor, expected to look
            like ``/url?q=<destination>&<tracking parameters>``.

    Returns:
        The destination with the ``/url?q=`` prefix and everything from the
        first ``&`` removed, and ``%2520`` turned into a space. Other
        percent-escapes are left untouched.
    """
    prefix = '/url?q='
    # str.lstrip() takes a *set of characters*, not a prefix, so it would also
    # eat leading characters of the destination. Remove the exact prefix.
    if href.startswith(prefix):
        href = href[len(prefix):]
    # Drop the extra query parameters appended after the destination.
    href = href.split('&')[0]
    # "%2520" is a percent-encoded "%20", i.e. a doubly-encoded space.
    return href.replace("%2520", " ")


if __name__ == "__main__":
    resp = requests.get(address)
    resp.raise_for_status()
    # NOTE(review): the body is force-decoded as EUC-KR; presumably that is
    # what the server returned for this client -- confirm before changing.
    resp.encoding = 'euc-kr'
    html = resp.text

    bs = BeautifulSoup(html, 'html.parser')
    # "div.jfp3ef" is the class attribute of the div tag in front of the
    # result's <a> tag.
    for tags in bs.select('div.jfp3ef a'):
        href = tags.get('href')
        if not href:
            # Anchor without an href: nothing to print.
            continue
        print(clean_result_href(href))

from bs4 import BeautifulSoup

import requests

# Google search URL restricted to cdn-learn.adafruit.com/downloads/pdf/
# (num=3000, starting at result offset 100).
address = 'https://www.google.co.kr/search?num=3000&start=100&ie=UTF-8&q=site:https://cdn-learn.adafruit.com/downloads/pdf/'


def result_href_to_pdf_url(href):
    """Return the bare file URL carried by a search-result link.

    Args:
        href: The raw ``href`` attribute of a result anchor, expected to look
            like ``/url?q=<destination>&<tracking parameters>``.

    Returns:
        The destination with the ``/url?q=`` prefix and everything from the
        first ``&`` removed, ``%2520`` turned into a space, and then cut at
        the first remaining ``%``. The cut removes suffixes such as
        ``%3Ftimestamp%3D1561616644`` that follow the file name; note that it
        also truncates any destination containing another percent-escape.
    """
    prefix = '/url?q='
    # str.lstrip() takes a *set of characters*, not a prefix, so it would also
    # eat leading characters of the destination. Remove the exact prefix.
    if href.startswith(prefix):
        href = href[len(prefix):]
    # Drop the extra query parameters appended after the destination.
    href = href.split('&')[0]
    # "%2520" is a percent-encoded "%20", i.e. a doubly-encoded space.
    href = href.replace("%2520", " ")
    # Keep only the text before the first "%"; a string without "%" is
    # returned unchanged.
    return href.partition('%')[0]


if __name__ == "__main__":
    resp = requests.get(address)
    resp.raise_for_status()
    # NOTE(review): the body is force-decoded as EUC-KR; presumably that is
    # what the server returned for this client -- confirm before changing.
    resp.encoding = 'euc-kr'
    html = resp.text

    bs = BeautifulSoup(html, 'html.parser')
    for tags in bs.select('div.jfp3ef a'):
        href = tags.get('href')
        if not href:
            # Anchor without an href: nothing to print.
            continue
        print(result_href_to_pdf_url(href))

저작자표시 비영리 변경금지 (새창열림)