
3 minute read
ПАРСИНГ САЙТОВ. ПАРСЕР НА ЯЗЫКЕ PYTHON — Кононов Владислав Андреевич
from internauka292020
by bortnikova
ИНФОРМАЦИОННЫЕ ТЕХНОЛОГИИ
ПАРСИНГ САЙТОВ. ПАРСЕР НА ЯЗЫКЕ PYTHON
Advertisement
Кононов Владислав Андреевич
студент, Северный (Арктический) федеральный университет имени М. В. Ломоносова, РФ, г. Архангельск
АННОТАЦИЯ
В данной статье приводится разбор и описание программного кода, выполняющего парсинг сайтов с выводом информации в удобном для пользователя виде. Программный код представлен на языке программирования Python.
Ключевые слова: парсинг, python.
Новостные сайты являются хорошим источником данных для обработки моделями машинного обучения. При этом большинство новостных web-ресурсов перегружено различной «лишней» информацией, не относящейся к сути статей, будь то навязчивая реклама, всплывающие окна со служебной информацией и т.д. Очистка статей при выгрузке с web-ресурсов от «шума» помогла бы снизить количество ошибок обработки данных.
Парсинг сайтов – это новый метод ввода данных, который не требует повторного ввода или копипастинга. Такого рода программное обеспечение ищет информацию под контролем пользователя или автоматически, выбирая новые или обновленные данные и сохраняя их в таком виде, чтобы у пользователя был к ним быстрый доступ.
Для решения данной проблемы создадим инструмент, с помощью которого можно получать текст из статей с минимальной ненужной информацией.
Программа будет писаться на языке Python, так как этот язык является модульным, что позволит без особых трудностей решить поставленную задачу.
Для начала работы нам потребуются две библиотеки:
requests;
BeautifulSoup.
Библиотека requests по факту является стандартом при работе с составлением HTTP-запросов к web-ресурсам. В этой библиотеке собрано множество методов, с помощью которых значительно упрощается процесс создания запросов и разбора ответов от сервера.
Библиотека BeautifulSoup используется для анализа документов HTML и XML. Она создает дерево синтаксического анализа для полученных страниц (DOM-дерево), которое можно использовать для извлечения данных из HTML в удобные для обработки на языке Python конструкции (словари, списки, строки), что полезно для очистки веб-страниц от ненужной информации.
Основной алгоритм программы будет заключаться в следующем: при запуске скрипт запросит URL-адрес, где располагается статья; с помощью запроса GET получим HTML-страницу. Затем, с помощью созданного объекта BeautifulSoup, очистим статью от лишней информации.
Перед написанием кода необходимо проанализировать основную структуру статей. Почти в каждой статье нужный текст располагается в тегах <p></p>. Именно эти теги будут являться критерием отбора информации.
При решении данной задачи можно использовать как классы, так и обычные функции. Напишем код без помощи классов.
Сначала импортируем необходимые библиотеки, в верхней части кода указываем все необходимые переменные:
url = ""
headers = {'user_agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0; Touch)', 'accept': '*/*'}
path = ""
content_tags = ["p"]
wrap = 80
Описание объектов: url – URL-адрес обрабатываемой статьи; headers – необходимые стандартные параметры; path – путь для сохранения обработанного файла;
content_tags – html-теги для обработки; wrap – количество символов в строке очищенного текста.
Далее напишем небольшой метод, который будет, в случае удачного результата, разрешать проведение основного алгоритма:
def get_html(url, params=None, timeout=10):
    """Send a GET request to *url* and return the server response.

    Args:
        url: Address of the page to download.
        params: Optional query-string parameters forwarded to
            ``requests.get``.
        timeout: Seconds to wait for the server before giving up, so an
            unresponsive site cannot block the script indefinitely.

    Returns:
        The ``requests.Response`` object (not the HTML text itself);
        the caller inspects its ``status_code``.
    """
    # The request headers are kept in the module-level ``headers`` dict
    # declared together with the other settings (url, path, wrap, ...).
    return requests.get(url, headers=headers, params=params, timeout=timeout)
Далее напишем основную функцию программы, которая будет получать ответ от web-ресурса и при помощи BeautifulSoup отфильтровывать нужные нам данные:
def get_content(url):
    """Download the article at *url*, extract its text and save it.

    Every tag listed in the module-level ``content_tags`` is collected,
    hyperlinks inside it are rewritten as ``[target]``, and the tag text
    is wrapped to ``wrap`` columns. The formatted paragraphs are joined
    (each followed by a blank line) and handed to ``save_text``.

    Args:
        url: Address of the article page.
    """
    # Send the same headers as the rest of the script and never wait
    # forever on an unresponsive server.
    response = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')

    paragraphs = []
    for tag in soup.find_all(content_tags):
        # Skip tags that are empty or contain only whitespace.
        if not tag.text.strip():
            continue
        # Show each hyperlink as "[target]". Only anchors that actually
        # carry an href are touched, and the anchor being iterated is the
        # one replaced (rather than "the first <a> still in the tag").
        for link in tag.find_all('a', href=True):
            link.replace_with("[" + link['href'] + "]")
        # Wrap the paragraph text to the configured column width.
        paragraphs.append(textwrap.fill(tag.text, wrap) + "\n\n")

    save_text("".join(paragraphs))
В качестве результата будем сохранять текст в файл с некоторыми изменениями.
Допишем функцию записи «чистого» текста в текстовый файл:
def save_text(text, file_path=None):
    """Write the cleaned article *text* to a text file.

    The file is overwritten; the text is preceded by a
    "new article" header line.

    Args:
        text: Cleaned article text to store.
        file_path: Destination file. When omitted, the module-level
            ``path`` setting is used.
    """
    target = path if file_path is None else file_path
    # The context manager closes the file even if the write fails; an
    # explicit encoding keeps Cyrillic output independent of the locale.
    with open(target, 'w', encoding='utf-8') as file:
        file.write("\n---------Новая статья---------\n\n" + text)
    print("Запись совершена успешно!")


# Next, define the function that checks the response and starts the
# main routine:
def parse(url):
    """Check that *url* responds with HTTP 200 and, if so, process it.

    Prints the HTTP status code. On status 200 the article is extracted
    and saved via ``get_content``; otherwise "Error" is printed.

    Args:
        url: Address of the article page.
    """
    html = get_html(url)
    print(html.status_code)
    if html.status_code == 200:
        get_content(url)
    else:
        print("Error")


# Finally, add the entry point that launches the script:
if __name__ == "__main__":
    print("Введите URL адрес:")
    # Surrounding whitespace from the console input is not part of the URL.
    URL = input().strip()
    parse(URL)
В дальнейшем данный скрипт можно усовершенствовать так, чтобы он в качестве входных данных использовал файл со списком URL и выгружал уже готовый массив данных целиком. Несмотря на то, что грабберы являются очень полезным инструментом по созданию выборок для проведения исследований, машинная выгрузка данных с web-ресурсов приводит к значительному повышению сетевой нагрузки на сайты и доставляет проблемы владельцам ресурсов. Некоторые даже блокируют IP-адреса, с которых замечена подобная активность. Поэтому призываю пользоваться грабберами в пределах разумного и не перегружать удаленные web-ресурсы, чтобы не вызывать негативных последствий для них.
Список литературы:
1. BeautifulSoup – парсинг HTML в Python на примерах. URL: https://python-scripts.com/beautifulsoup-html-parsing#html-parsing-example (дата обращения 07.08.2020)
2. Разновидности парсинга. URL: https://www.seonews.ru/glossary/parsing/ (дата обращения 20.06.2020)