# 유니시티황 2018. 6. 28. 03:01

from bs4 import BeautifulSoup

import requests

import urllib.parse


# Google query for PDFs under koreasw.org/wp-content/ matching "아두이노".
address = 'https://www.google.co.kr/search?num=100&ie=UTF-8&q=site:koreasw.org/wp-content/+아두이노+pdf'

# A timeout keeps the script from hanging forever on a stalled connection.
resp = requests.get(address, timeout=10)
resp.raise_for_status()

# Force resp.text to be decoded as EUC-KR.
resp.encoding = 'euc-kr'
html = resp.text

# The file imports the class by name (`from bs4 import BeautifulSoup`), so it
# must be referenced without the `bs4.` module prefix.
bs = BeautifulSoup(html, 'html.parser')

# Result links are wrapped as "/url?q=<real url>&<tracking params>".
_REDIRECT_PREFIX = '/url?q='

for tags in bs.select('div.jfp3ef a'):
    target = tags.get('href')
    if not target:
        # Anchor without an href: nothing to extract.
        continue

    # Remove the exact prefix only; str.lstrip() would treat the argument as a
    # set of characters and could also eat the start of the real URL.
    if target.startswith(_REDIRECT_PREFIX):
        target = target[len(_REDIRECT_PREFIX):]

    # Drop the tracking parameters appended after the wrapped URL.
    target = target.split('&')[0]
    # "%2520" is a double-encoded space.
    target = target.replace("%2520", " ")

    # Decode twice so that double-encoded (e.g. Korean) file names become readable.
    target = urllib.parse.unquote(target)
    target = urllib.parse.unquote(target)

    print(target)




from bs4 import BeautifulSoup

import requests



# Google query for PDFs under www.unicitypartner.cz/pdf/.
address = 'https://www.google.co.kr/search?num=100&ie=UTF-8&q=site:http://www.unicitypartner.cz/pdf/ pdf'

# A timeout keeps the script from hanging forever on a stalled connection.
source = requests.get(address, timeout=10)
# Fail loudly on an HTTP error instead of parsing an error page.
source.raise_for_status()
html = source.text

soup = BeautifulSoup(html, 'lxml')

# Result links are wrapped as "/url?q=<real url>&<tracking params>".
_REDIRECT_PREFIX = '/url?q='

for tag in soup.select('h3[class=r]'):
    link = tag.a
    if link is None:
        # Heading without an anchor: nothing to extract.
        continue

    target = link.get('href')
    if not target:
        continue

    # Remove the exact prefix only; str.lstrip() would treat the argument as a
    # set of characters and could also eat the start of the real URL.
    if target.startswith(_REDIRECT_PREFIX):
        target = target[len(_REDIRECT_PREFIX):]

    # Drop the tracking parameters appended after the wrapped URL.
    target = target.split('&')[0]
    # "%2520" is a double-encoded space.
    target = target.replace("%2520", " ")

    print(target)




from bs4 import BeautifulSoup

import requests


# Google query for PDFs under www.asamanthinketh.net/files/.
address = 'https://www.google.co.kr/search?num=100&ie=UTF-8&q=site:www.asamanthinketh.net/files/ pdf'

# A timeout keeps the script from hanging forever on a stalled connection.
resp = requests.get(address, timeout=10)
resp.raise_for_status()

# Force resp.text to be decoded as EUC-KR.
resp.encoding = 'euc-kr'
html = resp.text

# The file imports the class by name (`from bs4 import BeautifulSoup`), so it
# must be referenced without the `bs4.` module prefix.
bs = BeautifulSoup(html, 'html.parser')

# Result links are wrapped as "/url?q=<real url>&<tracking params>".
_REDIRECT_PREFIX = '/url?q='

for tags in bs.select('div.jfp3ef a'):
    target = tags.get('href')
    if not target:
        # Anchor without an href: nothing to extract.
        continue

    # Remove the exact prefix only; str.lstrip() would treat the argument as a
    # set of characters and could also eat the start of the real URL.
    if target.startswith(_REDIRECT_PREFIX):
        target = target[len(_REDIRECT_PREFIX):]

    # Drop the tracking parameters appended after the wrapped URL.
    target = target.split('&')[0]
    # "%2520" is a double-encoded space.
    target = target.replace("%2520", " ")

    print(target)





from bs4 import BeautifulSoup

import requests


# Google query for files under cdn-learn.adafruit.com/downloads/pdf/.
address = 'https://www.google.co.kr/search?num=3000&start=100&ie=UTF-8&q=site:https://cdn-learn.adafruit.com/downloads/pdf/'

# num=3000  : total number of search results requested
# start=100 : with 100 results per page, this is the start of the second page
#      =200 : start of the third page

# A timeout keeps the script from hanging forever on a stalled connection.
resp = requests.get(address, timeout=10)
resp.raise_for_status()

# Force resp.text to be decoded as EUC-KR.
resp.encoding = 'euc-kr'
html = resp.text

# The file imports the class by name (`from bs4 import BeautifulSoup`), so it
# must be referenced without the `bs4.` module prefix.
bs = BeautifulSoup(html, 'html.parser')

# Result links are wrapped as "/url?q=<real url>&<tracking params>".
_REDIRECT_PREFIX = '/url?q='

# 'div.jfp3ef a': anchors inside a div whose class attribute is "jfp3ef".
for tags in bs.select('div.jfp3ef a'):
    target = tags.get('href')
    if not target:
        # Anchor without an href: nothing to extract.
        continue

    # Remove the exact prefix only; str.lstrip() would treat the argument as a
    # set of characters and could also eat the start of the real URL.
    if target.startswith(_REDIRECT_PREFIX):
        target = target[len(_REDIRECT_PREFIX):]

    # Drop the tracking parameters appended after the wrapped URL.
    target = target.split('&')[0]
    # "%2520" is a double-encoded space.
    target = target.replace("%2520", " ")

    print(target)




from bs4 import BeautifulSoup

import requests


# Google query for files under cdn-learn.adafruit.com/downloads/pdf/.
address = 'https://www.google.co.kr/search?num=3000&start=100&ie=UTF-8&q=site:https://cdn-learn.adafruit.com/downloads/pdf/'

# A timeout keeps the script from hanging forever on a stalled connection.
resp = requests.get(address, timeout=10)
resp.raise_for_status()

# Force resp.text to be decoded as EUC-KR.
resp.encoding = 'euc-kr'
html = resp.text

# The file imports the class by name (`from bs4 import BeautifulSoup`), so it
# must be referenced without the `bs4.` module prefix.
bs = BeautifulSoup(html, 'html.parser')

# Result links are wrapped as "/url?q=<real url>&<tracking params>".
_REDIRECT_PREFIX = '/url?q='

for tags in bs.select('div.jfp3ef a'):
    target = tags.get('href')
    if not target:
        # Anchor without an href: nothing to extract.
        continue

    # Remove the exact prefix only; str.lstrip() would treat the argument as a
    # set of characters and could also eat the start of the real URL.
    if target.startswith(_REDIRECT_PREFIX):
        target = target[len(_REDIRECT_PREFIX):]

    # Drop the tracking parameters appended after the wrapped URL.
    target = target.split('&')[0]
    # "%2520" is a double-encoded space.
    target = target.replace("%2520", " ")

    # Cut off whatever is appended after the file name, e.g.
    # "%3Ftimestamp%3D1561616644". split() returns the whole string as its
    # first element when there is no '%', so no membership check is needed.
    target = target.split('%')[0]

    print(target)