{"id":18,"date":"2022-08-23T14:46:53","date_gmt":"2022-08-23T06:46:53","guid":{"rendered":"https:\/\/mianka.xyz\/?p=16"},"modified":"2022-08-23T14:54:33","modified_gmt":"2022-08-23T06:54:33","slug":"pythonxpath","status":"publish","type":"post","link":"https:\/\/www.mianka.xyz\/?p=18","title":{"rendered":"Python\u722c\u866bxpath\u8be6\u89e3"},"content":{"rendered":"<h2>\u4e00\u3001xpath\u4ecb\u7ecd<\/h2>\n<p>xpath\u662f\u4e00\u95e8\u5728 XML \u6587\u6863\u4e2d\u67e5\u627e\u4fe1\u606f\u7684\u8bed\u8a00\u3002\u6700\u521d\u662f\u7528\u6765\u641c\u5bfb XML \u6587\u6863\u7684\uff0c\u4f46\u540c\u6837\u9002\u7528\u4e8e HTML \u6587\u6863\u7684\u641c\u7d22\u3002\u6240\u4ee5\u5728\u505a\u722c\u866b\u65f6\u5b8c\u5168\u53ef\u4ee5\u4f7f\u7528 XPath \u505a\u76f8\u5e94\u7684\u4fe1\u606f\u62bd\u53d6\u3002<\/p>\n<p><\/p>\n<h2>\u4e8c\u3001\u5b89\u88c5lxml<\/h2>\n<p>lxml\u662fPython\u7684\u4e00\u4e2a\u7b2c\u4e09\u65b9\u89e3\u6790\u5e93\uff0c\u652f\u6301HTML\u548cXML\u89e3\u6790\uff0c\u800c\u4e14\u6548\u7387\u975e\u5e38\u9ad8\uff0c\u5f25\u8865\u4e86Python\u81ea\u5e26\u7684xml\u6807\u51c6\u5e93\u5728XML\u89e3\u6790\u65b9\u9762\u7684\u4e0d\u8db3\u3002<\/p>\n<p><\/p>\n<p>\u7531\u4e8e\u662f\u7b2c\u4e09\u65b9\u5e93\uff0c\u6240\u4ee5\u5728\u4f7f\u7528 lxml \u4e4b\u524d\u9700\u8981\u5148\u5b89\u88c5\uff1apip install 
lxml<\/p>\n<p><\/p>\n<h2>\u4e09\u3001xpath\u89e3\u6790\u539f\u7406<\/h2>\n<p>1.\u5b9e\u4f8b\u5316\u4e00\u4e2aetree\u7684\u5bf9\u8c61\uff0c\u4e14\u9700\u8981\u5c06\u88ab\u89e3\u6790\u7684\u9875\u9762\u6e90\u7801\u6570\u636e\u52a0\u8f7d\u5230\u8be5\u5bf9\u8c61\u4e2d\u3002<\/p>\n<p><\/p>\n<p>2.\u8c03\u7528etree\u5bf9\u8c61\u4e2d\u7684xpath\u65b9\u6cd5\u7ed3\u5408\u7740xpath\u8868\u8fbe\u5f0f\u5b9e\u73b0\u6807\u7b7e\u7684\u5b9a\u4f4d\u548c\u5185\u5bb9\u7684\u6355\u83b7\u3002<\/p>\n<p><\/p>\n<h2>\u56db\u3001\u5b9e\u4f8b\u5316\u4e00\u4e2aetree\u5bf9\u8c61<\/h2>\n<p>&nbsp;1.\u5c06\u672c\u5730\u7684html\u6587\u6863\u4e2d\u7684\u6e90\u7801\u6570\u636e\u52a0\u8f7d\u5230etree\u5bf9\u8c61\u4e2d\uff1a<\/p>\n<pre class=\"prism-highlight prism-language-markup\">etree.parse(filePath)<\/pre>\n<p>&nbsp;2.\u53ef\u4ee5\u5c06\u4ece\u4e92\u8054\u7f51\u4e0a\u83b7\u53d6\u7684\u6e90\u7801\u6570\u636e\u52a0\u8f7d\u5230\u8be5\u5bf9\u8c61\u4e2d<\/p>\n<pre class=\"prism-highlight prism-language-markup\">etree.HTML(&#39;page_text&#39;)<\/pre>\n<p><\/p>\n<p>3.xpath(&#39;xpath\u8868\u8fbe\u5f0f&#39;)<\/p>\n<p><\/p>\n<h2>\u4e94\u3001xpath\u8868\u8fbe\u5f0f<\/h2>\n<p>\/:\u8868\u793a\u7684\u662f\u4ece\u6839\u8282\u70b9\u5f00\u59cb\u5b9a\u4f4d\u3002\u8868\u793a\u7684\u662f\u4e00\u4e2a\u5c42\u7ea7\u3002<\/p>\n<p>&nbsp;\/\/:\u8868\u793a\u7684\u662f\u591a\u4e2a\u5c42\u7ea7\u3002\u53ef\u4ee5\u8868\u793a\u4ece\u4efb\u610f\u4f4d\u7f6e\u5f00\u59cb\u5b9a\u4f4d\u3002<\/p>\n<p>&nbsp;\u5c5e\u6027\u5b9a\u4f4d\uff1a\/\/div[@class=&#39;zhang&#39;] tag[@attrName=&quot;attrValue&quot;]<\/p>\n<p>&nbsp;\u7d22\u5f15\u5b9a\u4f4d\uff1a\/\/div[@class=&quot;zhang&quot;]\/p[3] \u7d22\u5f15\u662f\u4ece1\u5f00\u59cb\u7684\u3002<\/p>\n<p>&nbsp; \u53d6\u6587\u672c\uff1a\/text() \u83b7\u53d6\u7684\u662f\u6807\u7b7e\u4e2d\u76f4\u7cfb\u7684\u6587\u672c\u5185\u5bb9\u3001\/\/text() \u6807\u7b7e\u4e2d\u975e\u76f4\u7cfb\u7684\u6587\u672c\u5185\u5bb9\uff08\u6240\u6709\u7684\u6587\u672c\u5185\u5bb9\uff09<\/p>\n<p>&nbsp; 
\u53d6\u5c5e\u6027\uff1a \/@attrName&nbsp; &nbsp; &nbsp;==&gt;img\/@src<\/p>\n<p>\u516d\u3001\u722c\u866b\u5b9e\u6218<\/p>\n<p>\u722c\u53d6\u7f51\u7ad9\u4fe1\u606f<\/p>\n<p><\/p>\n<pre class=\"prism-highlight prism-language-python\">import&nbsp;requests\nfrom&nbsp;lxml&nbsp;import&nbsp;etree\n&nbsp;\n#&nbsp;\u4f7f\u7528xpath&nbsp;\u722c\u53d6\u4e8c\u624b\u623f\u4fe1\u606f\n&nbsp;\nheaders&nbsp;=&nbsp;{\n&nbsp;&nbsp;&nbsp;&nbsp;&#39;User-Agent&#39;&nbsp;:&#39;Mozilla\/5.0&nbsp;(Windows&nbsp;NT&nbsp;10.0;&nbsp;WOW64)&nbsp;AppleWebKit\/537.36&nbsp;(KHTML,&nbsp;like&nbsp;Gecko)&nbsp;Chrome\/94.0.4606.81&nbsp;Safari\/537.36&#39;\n}\nurl&nbsp;=&nbsp;&#39;https:\/\/bj.58.com\/ershoufang\/{0}\/?PGTID=0d30000c-0000-1bdc-b1e5-d9efea8142e9&amp;ClickID=1&#39;\nfp&nbsp;=&nbsp;open(&#39;58\u4e8c\u624b\u623f.txt&#39;,&#39;w&#39;,encoding=&#39;utf-8&#39;)\nfor&nbsp;page&nbsp;in&nbsp;range(1,6):\n&nbsp;&nbsp;&nbsp;&nbsp;page&nbsp;=&nbsp;str(page)\n&nbsp;&nbsp;&nbsp;&nbsp;new_url&nbsp;=&nbsp;url.format(page)\n&nbsp;&nbsp;&nbsp;&nbsp;page_text&nbsp;=&nbsp;requests.get(url=new_url,headers=headers).text\n&nbsp;&nbsp;&nbsp;&nbsp;tree&nbsp;=&nbsp;etree.HTML(page_text)\n&nbsp;&nbsp;&nbsp;&nbsp;div_list&nbsp;=&nbsp;tree.xpath(&#39;\/\/section[@class=&quot;list&quot;]\/div&#39;)\n&nbsp;&nbsp;&nbsp;&nbsp;for&nbsp;div&nbsp;in&nbsp;div_list:\n&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;title&nbsp;=&nbsp;div.xpath(&#39;.\/a\/\/h3\/@title&#39;)[0]\n&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;roomsAndHalls&nbsp;=&nbsp;div.xpath(&#39;.\/a\/\/div[@class=&quot;property-content-info&quot;]\/\/span\/text()&#39;)\n&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;address&nbsp;=&nbsp;div.xpath(&#39;.\/a\/\/div[@class=&quot;property-content-info&nbsp;property-content-info-comm&quot;]\/\/span\/text()&#39;)\n&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;house_year&nbsp;=&nbsp;div.xpath(&#39;.\/a\/\/span[@class=&quot;property-content-info-tag&quot;]\/text()&#39;)\n&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&
nbsp;&nbsp;total_price&nbsp;=&nbsp;div.xpath(&#39;.\/a\/\/p[@class=&quot;property-price-total&quot;]\/\/span\/text()&#39;)\n&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;total_area&nbsp;=&nbsp;div.xpath(&#39;.\/a\/\/p[@class=&quot;property-price-average&quot;]\/\/text()&#39;)\n&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;roomsAndHalls&nbsp;=&nbsp;&#39;&#39;.join(roomsAndHalls)\n&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;address&nbsp;=&nbsp;&#39;&#39;.join(address)\n&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;house_year&nbsp;=&nbsp;&#39;&#39;.join(house_year)\n&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;total_price&nbsp;=&nbsp;&#39;&#39;.join(total_price)\n&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;total_area&nbsp;=&nbsp;&#39;&#39;.join(total_area)\n&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;#&nbsp;print(title)\n&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;#&nbsp;print(&#39;&#39;.join(roomsAndHalls))\n&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;#&nbsp;print(&#39;&#39;.join(address))\n&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;#&nbsp;print(&#39;&#39;.join(house_year))\n&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;print(title+&#39;======&gt;&#39;+roomsAndHalls+&#39;======&gt;&#39;+address+&#39;======&gt;&#39;+house_year+&#39;======&gt;&#39;+total_price+&#39;======&gt;&#39;+total_area)\n&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;fp.write(title+&#39;======&gt;&#39;+roomsAndHalls+&#39;======&gt;&#39;+address+&#39;======&gt;&#39;+house_year+&#39;======&gt;&#39;+total_price+&#39;======&gt;&#39;+total_area+&#39;\\n&#39;)<\/pre>\n<p>\u5176\u5b9expath\u5f88\u7b80\u5355\uff0c\u53ea\u8981\u628a\u8868\u8fbe\u5f0f\u7406\u89e3\u5bf9\u5c31\u6ca1\u5565\u5927\u95ee\u9898\u3002<\/p>\n<p><\/p>\n","protected":false},"excerpt":{"rendered":"<h2>\u4e00\u3001xpath\u4ecb\u7ecd<\/h2>\n<p>xpath\u662f\u4e00\u95e8\u5728 XML \u6587\u6863\u4e2d\u67e5\u627e\u4fe1\u606f\u7684\u8bed\u8a00\u3002\u6700\u521d\u662f\u7528\u6765\u641c\u5bfb XML 
\u6587\u6863\u7684\uff0c\u4f46\u540c\u6837\u9002\u7528\u4e8e HTML \u6587\u6863\u7684\u641c\u7d22\u3002\u6240\u4ee5\u5728\u505a\u722c\u866b\u65f6\u5b8c\u5168\u53ef\u4ee5\u4f7f\u7528 XPath \u505a\u76f8\u5e94\u7684\u4fe1\u606f\u62bd\u53d6\u3002<\/p>\n<p><\/p>\n<h2>\u4e8c\u3001\u5b89\u88c5lxml<\/h2>\n<p>lxml\u662fPython\u7684\u4e00\u4e2a\u7b2c\u4e09\u65b9\u89e3\u6790\u5e93\uff0c\u652f\u6301HTML\u548cXML\u89e3\u6790\uff0c\u800c\u4e14\u6548\u7387\u975e\u5e38\u9ad8\uff0c\u5f25\u8865\u4e86Python\u81ea\u5e26\u7684xml\u6807\u51c6\u5e93\u5728XML\u89e3\u6790\u65b9\u9762\u7684\u4e0d\u8db3\u3002<\/p>\n<p><\/p>\n<p>\u7531\u4e8e\u662f\u7b2c\u4e09\u65b9\u5e93\uff0c\u6240\u4ee5\u5728\u4f7f\u7528 lxml \u4e4b\u524d\u9700\u8981\u5148\u5b89\u88c5\uff1apip install lxml<\/p>\n<p><!--autointro--><\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[3],"tags":[15],"class_list":["post-18","post","type-post","status-publish","format-standard","hentry","category-pythonbiji","tag-python"],"_links":{"self":[{"href":"https:\/\/www.mianka.xyz\/index.php?rest_route=\/wp\/v2\/posts\/18","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.mianka.xyz\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.mianka.xyz\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.mianka.xyz\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/www.mianka.xyz\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=18"}],"version-history":[{"count":0,"href":"https:\/\/www.mianka.xyz\/index.php?rest_route=\/wp\/v2\/posts\/18\/revisions"}],"wp:attachment":[{"href":"https:\/\/www.mianka.xyz\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=18"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.mianka.xyz\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&pos
t=18"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.mianka.xyz\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=18"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}