[ 기타 활동 ]/파이썬

스크롤러2

유니시티황 2018. 6. 28. 03:01

from bs4 import BeautifulSoup

import requests

import urllib.parse


# Fetch a Google results page for PDF files under koreasw.org/wp-content/
# and print the target URL of every matching result link.
address = 'https://www.google.co.kr/search?num=100&ie=UTF-8&q=site:koreasw.org/wp-content/+아두이노+pdf'

resp = requests.get(address)
resp.raise_for_status()  # stop here on an HTTP error status

# Force the body to be decoded as EUC-KR.
# NOTE(review): confirm the response really is EUC-KR and not UTF-8.
resp.encoding = 'euc-kr'
html = resp.text

# Only the class name is imported (`from bs4 import BeautifulSoup`), so the
# module-qualified `bs4.BeautifulSoup` would raise NameError.
bs = BeautifulSoup(html, 'html.parser')

# The loop below expects hrefs shaped like '/url?q=<target>&<other params>'.
redirect_prefix = '/url?q='

for tags in bs.select('div.jfp3ef a'):
    target = tags.get('href')
    if not target:
        # Anchor without an href: nothing to print.
        continue

    # Remove the exact prefix. str.lstrip() takes a *set of characters*, so
    # lstrip('/url?q=') would also eat leading '/', 'u', 'r', 'l', '?', 'q'
    # and '=' characters that belong to the target itself.
    if target.startswith(redirect_prefix):
        target = target[len(redirect_prefix):]

    # Keep only the part before the first '&'.
    target = target.split('&')[0]
    # '%2520' is a percent-encoded '%20', i.e. a double-encoded space.
    target = target.replace("%2520", " ")

    # Unquote twice so Korean file names become readable; presumably they can
    # arrive percent-encoded twice, like the '%2520' case above.
    target = urllib.parse.unquote(target)
    target = urllib.parse.unquote(target)

    print(target)




from bs4 import BeautifulSoup

import requests



# Fetch a Google results page for PDF files under www.unicitypartner.cz/pdf/
# and print the target URL of every result heading link.
address = 'https://www.google.co.kr/search?num=100&ie=UTF-8&q=site:http://www.unicitypartner.cz/pdf/ pdf'

source = requests.get(address)
# Fail loudly on an HTTP error status instead of parsing an error page,
# matching the other scripts in this file.
source.raise_for_status()
html = source.text

soup = BeautifulSoup(html, 'lxml')

# The loop below expects hrefs shaped like '/url?q=<target>&<other params>'.
redirect_prefix = '/url?q='

for tag in soup.select('h3[class=r]'):
    link = tag.a
    if link is None:
        # Heading without an anchor: nothing to extract.
        continue
    target = link.get('href')
    if not target:
        continue

    # Remove the exact prefix. str.lstrip() takes a *set of characters*, so
    # lstrip('/url?q=') would also eat leading '/', 'u', 'r', 'l', '?', 'q'
    # and '=' characters that belong to the target itself.
    if target.startswith(redirect_prefix):
        target = target[len(redirect_prefix):]

    # Keep only the part before the first '&'.
    target = target.split('&')[0]
    # '%2520' is a percent-encoded '%20', i.e. a double-encoded space.
    target = target.replace("%2520", " ")

    print(target)




from bs4 import BeautifulSoup

import requests


# Fetch a Google results page for PDF files under www.asamanthinketh.net/files/
# and print the target URL of every matching result link.
address = 'https://www.google.co.kr/search?num=100&ie=UTF-8&q=site:www.asamanthinketh.net/files/ pdf'

resp = requests.get(address)
resp.raise_for_status()  # stop here on an HTTP error status

# Force the body to be decoded as EUC-KR.
# NOTE(review): confirm the response really is EUC-KR and not UTF-8.
resp.encoding = 'euc-kr'
html = resp.text

# Only the class name is imported (`from bs4 import BeautifulSoup`), so the
# module-qualified `bs4.BeautifulSoup` would raise NameError.
bs = BeautifulSoup(html, 'html.parser')

# The loop below expects hrefs shaped like '/url?q=<target>&<other params>'.
redirect_prefix = '/url?q='

for tags in bs.select('div.jfp3ef a'):
    target = tags.get('href')
    if not target:
        # Anchor without an href: nothing to print.
        continue

    # Remove the exact prefix. str.lstrip() takes a *set of characters*, so
    # lstrip('/url?q=') would also eat leading '/', 'u', 'r', 'l', '?', 'q'
    # and '=' characters that belong to the target itself.
    if target.startswith(redirect_prefix):
        target = target[len(redirect_prefix):]

    # Keep only the part before the first '&'.
    target = target.split('&')[0]
    # '%2520' is a percent-encoded '%20', i.e. a double-encoded space.
    target = target.replace("%2520", " ")

    print(target)





from bs4 import BeautifulSoup

import requests


# Fetch one page of Google results for files under
# cdn-learn.adafruit.com/downloads/pdf/ and print each result's target URL.
address = 'https://www.google.co.kr/search?num=3000&start=100&ie=UTF-8&q=site:https://cdn-learn.adafruit.com/downloads/pdf/'

# Query parameters (translated from the author's notes):
#   num=3000   total number of search results requested
#   start=100  with 100 results per page, the start of the second page
#   start=200  the third page

resp = requests.get(address)
resp.raise_for_status()  # stop here on an HTTP error status

# Force the body to be decoded as EUC-KR.
# NOTE(review): confirm the response really is EUC-KR and not UTF-8.
resp.encoding = 'euc-kr'
html = resp.text

# Only the class name is imported (`from bs4 import BeautifulSoup`), so the
# module-qualified `bs4.BeautifulSoup` would raise NameError.
bs = BeautifulSoup(html, 'html.parser')

# The loop below expects hrefs shaped like '/url?q=<target>&<other params>'.
redirect_prefix = '/url?q='

# 'div.jfp3ef' selects by the class attribute of the <div> enclosing each link.
for tags in bs.select('div.jfp3ef a'):
    target = tags.get('href')
    if not target:
        # Anchor without an href: nothing to print.
        continue

    # Remove the exact prefix. str.lstrip() takes a *set of characters*, so
    # lstrip('/url?q=') would also eat leading '/', 'u', 'r', 'l', '?', 'q'
    # and '=' characters that belong to the target itself.
    if target.startswith(redirect_prefix):
        target = target[len(redirect_prefix):]

    # Keep only the part before the first '&'.
    target = target.split('&')[0]
    # '%2520' is a percent-encoded '%20', i.e. a double-encoded space.
    target = target.replace("%2520", " ")

    print(target)




from bs4 import BeautifulSoup

import requests


# Fetch one page of Google results for files under
# cdn-learn.adafruit.com/downloads/pdf/ and print each result's target URL
# with any percent-encoded suffix removed.
address = 'https://www.google.co.kr/search?num=3000&start=100&ie=UTF-8&q=site:https://cdn-learn.adafruit.com/downloads/pdf/'

resp = requests.get(address)
resp.raise_for_status()  # stop here on an HTTP error status

# Force the body to be decoded as EUC-KR.
# NOTE(review): confirm the response really is EUC-KR and not UTF-8.
resp.encoding = 'euc-kr'
html = resp.text

# Only the class name is imported (`from bs4 import BeautifulSoup`), so the
# module-qualified `bs4.BeautifulSoup` would raise NameError.
bs = BeautifulSoup(html, 'html.parser')

# The loop below expects hrefs shaped like '/url?q=<target>&<other params>'.
redirect_prefix = '/url?q='

for tags in bs.select('div.jfp3ef a'):
    target = tags.get('href')
    if not target:
        # Anchor without an href: nothing to print.
        continue

    # Remove the exact prefix. str.lstrip() takes a *set of characters*, so
    # lstrip('/url?q=') would also eat leading '/', 'u', 'r', 'l', '?', 'q'
    # and '=' characters that belong to the target itself.
    if target.startswith(redirect_prefix):
        target = target[len(redirect_prefix):]

    # Keep only the part before the first '&'.
    target = target.split('&')[0]
    # '%2520' is a percent-encoded '%20', i.e. a double-encoded space.
    target = target.replace("%2520", " ")

    # Drop whatever trails the file name from the first '%' onward, e.g.
    # '%3Ftimestamp%3D1561616644'. split() returns the whole string as its
    # first element when no '%' is present, so no separate check is needed.
    target = target.split('%', 1)[0]

    print(target)

'[ 기타 활동 ] > 파이썬' 카테고리의 다른 글

python에서 tinyDB사용하기  (0) 2018.07.13
MYSQL사용하기  (0) 2018.07.13
정규 표현식 (Regular Expression)  (0) 2018.05.16
파이썬에서 엑셀 사용하기  (0) 2018.05.16
Selenium 사용하기  (0) 2018.05.16