import urllib.request

# Fetch the page. A timeout is passed so the call cannot block forever if the
# server never answers (without it, urlopen falls back to the global socket
# timeout, which is normally unset).
# The response is deliberately left open: the following cells inspect it.
response = urllib.request.urlopen('https://kdding.github.io/researches.html', timeout=10)
type(response)

http.client.HTTPResponse

# HTTP status code returned by the server.
response.getcode()

200

# URL of the resource that was retrieved.
response.geturl()

'https://kdding.github.io/researches.html'

# Print the headers sent back with the response.
print(response.info())

Connection: close
Content-Length: 23449
Server: GitHub.com
Content-Type: text/html; charset=utf-8
permissions-policy: interest-cohort=()
Last-Modified: Wed, 18 Sep 2024 09:31:34 GMT
Access-Control-Allow-Origin: *
ETag: "66ea9df6-5b99"
expires: Mon, 21 Oct 2024 06:17:18 GMT
Cache-Control: max-age=600
x-proxy-cache: MISS
X-GitHub-Request-Id: A59E:665AC:AD76A1:B9DD68:6715EF95
Accept-Ranges: bytes
Age: 287
Date: Mon, 21 Oct 2024 06:12:05 GMT
Via: 1.1 varnish
X-Served-By: cache-qpg120109-QPG
X-Cache: HIT
X-Cache-Hits: 0
X-Timer: S1729491126.713372,VS0,VE1
Vary: Accept-Encoding
X-Fastly-Request-ID: af8d8924491f6de54bedc2c924603b6ca97a3286

import urllib.request

# Use the response as a context manager so the connection is closed as soon
# as the body has been read, and bound the wait with a timeout.
with urllib.request.urlopen('https://kdding.github.io/researches.html', timeout=10) as response:
    print(type(response))
    body = response.read()  # raw, undecoded bytes of the page
# Last expression of the cell: display the downloaded bytes.
body

<class 'http.client.HTTPResponse'>

b'<!DOCTYPE html>\n<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"><head>\n\n<meta charset="utf-8">\n<meta name="generator" content="quarto-1.4.553">\n\n<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">\n\n\n<title>Pingjian Ding - Researches</title>\n<style>\ncode{white-space: pre-wrap;}\nspan.smallcaps{font-variant: small-caps;}\ndiv.columns{display: flex; gap: min(4vw, 1.5em);}\ndiv.column{flex: auto; overflow-x: auto;}\ndiv.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}\nul.task-list{list-style: none;}\nul.task-list li input[type="checkbox"] {\n  width: 0.8em;\n  margin: 0 0.8em 0.2em -1em; /* quarto-specific, see https://github.com/quarto-dev/quarto-cli/issues/4556 */ \n  vertical-align: middle;\n}\n</style>\n\n\n<script src="site_libs/quarto-nav/quarto-nav.js"></script>\n<script src="site_libs/quarto-nav/headroom.min.js"></script>\n<script src="site_libs/clipboard/clipboard.min.js"></script>\n<script src="site_libs/quarto-search/autocomplete.umd.js"></script>\n<script src="site_libs/quarto-search/fuse.min.js"></script>\n<script src="site_libs/quarto-search/quarto-search.js"></script>\n<meta name="quarto:offset" content="./">\n<script src="site_libs/quarto-html/quarto.js"></script>\n<script src="site_libs/quarto-html/popper.min.js"></script>\n<script src="site_libs/quarto-html/tippy.umd.min.js"></script>\n<script src="site_libs/quarto-html/anchor.min.js"></script>\n<link href="site_libs/quarto-html/tippy.css" rel="stylesheet">\n<link href="site_libs/quarto-html/quarto-syntax-highlighting.css" rel="stylesheet" id="quarto-text-highlighting-styles">\n<script src="site_libs/bootstrap/bootstrap.min.js"></script>\n<link href="site_libs/bootstrap/bootstrap-icons.css" rel="stylesheet">\n<link href="site_libs/bootstrap/bootstrap.min.css" rel="stylesheet" id="quarto-bootstrap" data-mode="light">\n<script id="quarto-search-options" type="application/json">{\n  "location": "navbar",\n  "copy-button": 
false,\n  "collapse-after": 3,\n  "panel-placement": "end",\n  "type": "overlay",\n  "limit": 50,\n  "keyboard-shortcut": [\n    "f",\n    "/",\n    "s"\n  ],\n  "show-item-context": false,\n  "language": {\n    "search-no-results-text": "No results",\n    "search-matching-documents-text": "matching documents",\n    "search-copy-link-title": "Copy link to search",\n    "search-hide-matches-text": "Hide additional matches",\n    "search-more-match-text": "more match in this document",\n    "search-more-matches-text": "more matches in this document",\n    "search-clear-button-title": "Clear",\n    "search-text-placeholder": "",\n    "search-detached-cancel-button-title": "Cancel",\n    "search-submit-button-title": "Submit",\n    "search-label": "Search"\n  }\n}</script>\n\n\n<link rel="stylesheet" href="styles.css">\n</head>\n\n<body class="nav-fixed">\n\n<div id="quarto-search-results"></div>\n  <header id="quarto-header" class="headroom fixed-top">\n    <nav class="navbar navbar-expand-lg " data-bs-theme="dark">\n      <div class="navbar-container container-fluid">\n      <div class="navbar-brand-container mx-auto">\n    <a class="navbar-brand" href="./index.html">\n    <span class="navbar-title">Pingjian Ding</span>\n    </a>\n  </div>\n            <div id="quarto-search" class="" title="Search"></div>\n          <button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbarCollapse" aria-controls="navbarCollapse" aria-expanded="false" aria-label="Toggle navigation" onclick="if (window.quartoToggleHeadroom) { window.quartoToggleHeadroom(); }">\n  <span class="navbar-toggler-icon"></span>\n</button>\n          <div class="collapse navbar-collapse" id="navbarCollapse">\n            <ul class="navbar-nav navbar-nav-scroll me-auto">\n  <li class="nav-item">\n    <a class="nav-link" href="./index.html"> \n<span class="menu-text">Home</span></a>\n  </li>  \n  <li class="nav-item">\n    <a class="nav-link active" href="./researches.html" 
aria-current="page"> \n<span class="menu-text">Researches</span></a>\n  </li>  \n  <li class="nav-item">\n    <a class="nav-link" href="./publications.html"> \n<span class="menu-text">Publications</span></a>\n  </li>  \n  <li class="nav-item">\n    <a class="nav-link" href="./courses.html"> \n<span class="menu-text">Courses</span></a>\n  </li>  \n  <li class="nav-item">\n    <a class="nav-link" href="./blogs.html"> \n<span class="menu-text">Blogs</span></a>\n  </li>  \n  <li class="nav-item">\n    <a class="nav-link" href="./contact.html"> \n<span class="menu-text">Contact</span></a>\n  </li>  \n</ul>\n          </div> <!-- /navcollapse -->\n          <div class="quarto-navbar-tools">\n</div>\n      </div> <!-- /container-fluid -->\n    </nav>\n</header>\n<!-- content -->\n<div id="quarto-content" class="quarto-container page-columns page-rows-contents page-layout-article page-navbar">\n<!-- sidebar -->\n<!-- margin-sidebar -->\n    <div id="quarto-margin-sidebar" class="sidebar margin-sidebar">\n        <nav id="TOC" role="doc-toc" class="toc-active">\n    <h2 id="toc-title">On this page</h2>\n   \n  <ul>\n  <li><a href="#machine-learning-frameworks-in-biomedical-science" id="toc-machine-learning-frameworks-in-biomedical-science" class="nav-link active" data-scroll-target="#machine-learning-frameworks-in-biomedical-science">Machine Learning Frameworks in Biomedical Science</a></li>\n  <li><a href="#statistical-genetics-for-causal-relationships" id="toc-statistical-genetics-for-causal-relationships" class="nav-link" data-scroll-target="#statistical-genetics-for-causal-relationships">Statistical Genetics for Causal Relationships</a></li>\n  <li><a href="#target-trial-emulation" id="toc-target-trial-emulation" class="nav-link" data-scroll-target="#target-trial-emulation">Target trial emulation</a></li>\n  </ul>\n</nav>\n    </div>\n<!-- main -->\n<main class="content" id="quarto-document-content">\n\n<header id="title-block-header" class="quarto-title-block 
default">\n<div class="quarto-title">\n<h1 class="title">Researches</h1>\n</div>\n\n\n\n<div class="quarto-title-meta">\n\n    \n  \n    \n  </div>\n  \n\n\n</header>\n\n\n<section id="machine-learning-frameworks-in-biomedical-science" class="level2">\n<h2 class="anchored" data-anchor-id="machine-learning-frameworks-in-biomedical-science">Machine Learning Frameworks in Biomedical Science</h2>\n<div class="columns">\n<div class="column" style="width:65%;">\n<p>We have been developing tensor factorization and deep learning methods to facilitate the analysis of epigenomic data and use integrative modeling approaches to study genomic transcriptional and epigenetic gene regulatory mechanisms underlying psychiatry diseases. We are developing new methods to utilize the abundant public relationship data to understand targets and specificity of drugs in treating Alzheimer\xe2\x80\x99s Disease.</p>\n</div><div class="column" style="width:35%;">\n<p><img src="assets/img/research-deep-learning.png" class="img-fluid"></p>\n</div>\n</div>\n</section>\n<section id="statistical-genetics-for-causal-relationships" class="level2">\n<h2 class="anchored" data-anchor-id="statistical-genetics-for-causal-relationships">Statistical Genetics for Causal Relationships</h2>\n<div class="columns">\n<div class="column" style="width:65%;">\n<p>We have been utilizing causal inference methods to facilitate the analysis of genotyping data and employing Mendelian randomization approaches to investigate the causal associations between Alzheimer\xe2\x80\x99s disease and various drugs, viruses, and other risk factors. 
Our investigations on aspirin and GLP-1R agonists suggest that they may have the potential to offer effective neuroprotective properties.</p>\n</div><div class="column" style="width:35%;">\n<p><img src="assets/img/statistic.png" class="img-fluid"></p>\n</div>\n</div>\n</section>\n<section id="target-trial-emulation" class="level2">\n<h2 class="anchored" data-anchor-id="target-trial-emulation">Target trial emulation</h2>\n<div class="columns">\n<div class="column" style="width:65%;">\n<p>We are developing deep learning-based methods to approximate the results of a randomized trial by directly comparing outcomes between individuals who received the treatment of interest and those who did not. Additionally, by analyzing electronic health records, we are identifying potential drug combinations for controlling blood pressure and exploring novel drugs for neurodegenerative diseases.</p>\n</div><div class="column" style="width:35%;">\n<p><img src="assets/img/research_EHR.png" class="img-fluid"></p>\n</div>\n</div>\n\n\n</section>\n\n</main> <!-- /main -->\n<script id="quarto-html-after-body" type="application/javascript">\nwindow.document.addEventListener("DOMContentLoaded", function (event) {\n  const toggleBodyColorMode = (bsSheetEl) => {\n    const mode = bsSheetEl.getAttribute("data-mode");\n    const bodyEl = window.document.querySelector("body");\n    if (mode === "dark") {\n      bodyEl.classList.add("quarto-dark");\n      bodyEl.classList.remove("quarto-light");\n    } else {\n      bodyEl.classList.add("quarto-light");\n      bodyEl.classList.remove("quarto-dark");\n    }\n  }\n  const toggleBodyColorPrimary = () => {\n    const bsSheetEl = window.document.querySelector("link#quarto-bootstrap");\n    if (bsSheetEl) {\n      toggleBodyColorMode(bsSheetEl);\n    }\n  }\n  toggleBodyColorPrimary();  \n  const icon = "\xee\xa7\x8b";\n  const anchorJS = new window.AnchorJS();\n  anchorJS.options = {\n    placement: \'right\',\n    icon: icon\n  };\n  
anchorJS.add(\'.anchored\');\n  const isCodeAnnotation = (el) => {\n    for (const clz of el.classList) {\n      if (clz.startsWith(\'code-annotation-\')) {                     \n        return true;\n      }\n    }\n    return false;\n  }\n  const clipboard = new window.ClipboardJS(\'.code-copy-button\', {\n    text: function(trigger) {\n      const codeEl = trigger.previousElementSibling.cloneNode(true);\n      for (const childEl of codeEl.children) {\n        if (isCodeAnnotation(childEl)) {\n          childEl.remove();\n        }\n      }\n      return codeEl.innerText;\n    }\n  });\n  clipboard.on(\'success\', function(e) {\n    // button target\n    const button = e.trigger;\n    // don\'t keep focus\n    button.blur();\n    // flash "checked"\n    button.classList.add(\'code-copy-button-checked\');\n    var currentTitle = button.getAttribute("title");\n    button.setAttribute("title", "Copied!");\n    let tooltip;\n    if (window.bootstrap) {\n      button.setAttribute("data-bs-toggle", "tooltip");\n      button.setAttribute("data-bs-placement", "left");\n      button.setAttribute("data-bs-title", "Copied!");\n      tooltip = new bootstrap.Tooltip(button, \n        { trigger: "manual", \n          customClass: "code-copy-button-tooltip",\n          offset: [0, -8]});\n      tooltip.show();    \n    }\n    setTimeout(function() {\n      if (tooltip) {\n        tooltip.hide();\n        button.removeAttribute("data-bs-title");\n        button.removeAttribute("data-bs-toggle");\n        button.removeAttribute("data-bs-placement");\n      }\n      button.setAttribute("title", currentTitle);\n      button.classList.remove(\'code-copy-button-checked\');\n    }, 1000);\n    // clear code selection\n    e.clearSelection();\n  });\n    var localhostRegex = new RegExp(/^(?:http|https):\\/\\/localhost\\:?[0-9]*\\//);\n    var mailtoRegex = new RegExp(/^mailto:/);\n      var filterRegex = new RegExp(\'/\' + window.location.host + \'/\');\n    var isInternal = (href) => 
{\n        return filterRegex.test(href) || localhostRegex.test(href) || mailtoRegex.test(href);\n    }\n    // Inspect non-navigation links and adorn them if external\n \tvar links = window.document.querySelectorAll(\'a[href]:not(.nav-link):not(.navbar-brand):not(.toc-action):not(.sidebar-link):not(.sidebar-item-toggle):not(.pagination-link):not(.no-external):not([aria-hidden]):not(.dropdown-item):not(.quarto-navigation-tool)\');\n    for (var i=0; i<links.length; i++) {\n      const link = links[i];\n      if (!isInternal(link.href)) {\n        // undo the damage that might have been done by quarto-nav.js in the case of\n        // links that we want to consider external\n        if (link.dataset.originalHref !== undefined) {\n          link.href = link.dataset.originalHref;\n        }\n      }\n    }\n  function tippyHover(el, contentFn, onTriggerFn, onUntriggerFn) {\n    const config = {\n      allowHTML: true,\n      maxWidth: 500,\n      delay: 100,\n      arrow: false,\n      appendTo: function(el) {\n          return el.parentElement;\n      },\n      interactive: true,\n      interactiveBorder: 10,\n      theme: \'quarto\',\n      placement: \'bottom-start\',\n    };\n    if (contentFn) {\n      config.content = contentFn;\n    }\n    if (onTriggerFn) {\n      config.onTrigger = onTriggerFn;\n    }\n    if (onUntriggerFn) {\n      config.onUntrigger = onUntriggerFn;\n    }\n    window.tippy(el, config); \n  }\n  const noterefs = window.document.querySelectorAll(\'a[role="doc-noteref"]\');\n  for (var i=0; i<noterefs.length; i++) {\n    const ref = noterefs[i];\n    tippyHover(ref, function() {\n      // use id or data attribute instead here\n      let href = ref.getAttribute(\'data-footnote-href\') || ref.getAttribute(\'href\');\n      try { href = new URL(href).hash; } catch {}\n      const id = href.replace(/^#\\/?/, "");\n      const note = window.document.getElementById(id);\n      if (note) {\n        return note.innerHTML;\n      } else {\n        
return "";\n      }\n    });\n  }\n  const xrefs = window.document.querySelectorAll(\'a.quarto-xref\');\n  const processXRef = (id, note) => {\n    // Strip column container classes\n    const stripColumnClz = (el) => {\n      el.classList.remove("page-full", "page-columns");\n      if (el.children) {\n        for (const child of el.children) {\n          stripColumnClz(child);\n        }\n      }\n    }\n    stripColumnClz(note)\n    if (id === null || id.startsWith(\'sec-\')) {\n      // Special case sections, only their first couple elements\n      const container = document.createElement("div");\n      if (note.children && note.children.length > 2) {\n        container.appendChild(note.children[0].cloneNode(true));\n        for (let i = 1; i < note.children.length; i++) {\n          const child = note.children[i];\n          if (child.tagName === "P" && child.innerText === "") {\n            continue;\n          } else {\n            container.appendChild(child.cloneNode(true));\n            break;\n          }\n        }\n        if (window.Quarto?.typesetMath) {\n          window.Quarto.typesetMath(container);\n        }\n        return container.innerHTML\n      } else {\n        if (window.Quarto?.typesetMath) {\n          window.Quarto.typesetMath(note);\n        }\n        return note.innerHTML;\n      }\n    } else {\n      // Remove any anchor links if they are present\n      const anchorLink = note.querySelector(\'a.anchorjs-link\');\n      if (anchorLink) {\n        anchorLink.remove();\n      }\n      if (window.Quarto?.typesetMath) {\n        window.Quarto.typesetMath(note);\n      }\n      // TODO in 1.5, we should make sure this works without a callout special case\n      if (note.classList.contains("callout")) {\n        return note.outerHTML;\n      } else {\n        return note.innerHTML;\n      }\n    }\n  }\n  for (var i=0; i<xrefs.length; i++) {\n    const xref = xrefs[i];\n    tippyHover(xref, undefined, function(instance) {\n      
instance.disable();\n      let url = xref.getAttribute(\'href\');\n      let hash = undefined; \n      if (url.startsWith(\'#\')) {\n        hash = url;\n      } else {\n        try { hash = new URL(url).hash; } catch {}\n      }\n      if (hash) {\n        const id = hash.replace(/^#\\/?/, "");\n        const note = window.document.getElementById(id);\n        if (note !== null) {\n          try {\n            const html = processXRef(id, note.cloneNode(true));\n            instance.setContent(html);\n          } finally {\n            instance.enable();\n            instance.show();\n          }\n        } else {\n          // See if we can fetch this\n          fetch(url.split(\'#\')[0])\n          .then(res => res.text())\n          .then(html => {\n            const parser = new DOMParser();\n            const htmlDoc = parser.parseFromString(html, "text/html");\n            const note = htmlDoc.getElementById(id);\n            if (note !== null) {\n              const html = processXRef(id, note);\n              instance.setContent(html);\n            } \n          }).finally(() => {\n            instance.enable();\n            instance.show();\n          });\n        }\n      } else {\n        // See if we can fetch a full url (with no hash to target)\n        // This is a special case and we should probably do some content thinning / targeting\n        fetch(url)\n        .then(res => res.text())\n        .then(html => {\n          const parser = new DOMParser();\n          const htmlDoc = parser.parseFromString(html, "text/html");\n          const note = htmlDoc.querySelector(\'main.content\');\n          if (note !== null) {\n            // This should only happen for chapter cross references\n            // (since there is no id in the URL)\n            // remove the first header\n            if (note.children.length > 0 && note.children[0].tagName === "HEADER") {\n              note.children[0].remove();\n            }\n            const html = 
processXRef(null, note);\n            instance.setContent(html);\n          } \n        }).finally(() => {\n          instance.enable();\n          instance.show();\n        });\n      }\n    }, function(instance) {\n    });\n  }\n      let selectedAnnoteEl;\n      const selectorForAnnotation = ( cell, annotation) => {\n        let cellAttr = \'data-code-cell="\' + cell + \'"\';\n        let lineAttr = \'data-code-annotation="\' +  annotation + \'"\';\n        const selector = \'span[\' + cellAttr + \'][\' + lineAttr + \']\';\n        return selector;\n      }\n      const selectCodeLines = (annoteEl) => {\n        const doc = window.document;\n        const targetCell = annoteEl.getAttribute("data-target-cell");\n        const targetAnnotation = annoteEl.getAttribute("data-target-annotation");\n        const annoteSpan = window.document.querySelector(selectorForAnnotation(targetCell, targetAnnotation));\n        const lines = annoteSpan.getAttribute("data-code-lines").split(",");\n        const lineIds = lines.map((line) => {\n          return targetCell + "-" + line;\n        })\n        let top = null;\n        let height = null;\n        let parent = null;\n        if (lineIds.length > 0) {\n            //compute the position of the single el (top and bottom and make a div)\n            const el = window.document.getElementById(lineIds[0]);\n            top = el.offsetTop;\n            height = el.offsetHeight;\n            parent = el.parentElement.parentElement;\n          if (lineIds.length > 1) {\n            const lastEl = window.document.getElementById(lineIds[lineIds.length - 1]);\n            const bottom = lastEl.offsetTop + lastEl.offsetHeight;\n            height = bottom - top;\n          }\n          if (top !== null && height !== null && parent !== null) {\n            // cook up a div (if necessary) and position it \n            let div = window.document.getElementById("code-annotation-line-highlight");\n            if (div === null) {\n          
    div = window.document.createElement("div");\n              div.setAttribute("id", "code-annotation-line-highlight");\n              div.style.position = \'absolute\';\n              parent.appendChild(div);\n            }\n            div.style.top = top - 2 + "px";\n            div.style.height = height + 4 + "px";\n            div.style.left = 0;\n            let gutterDiv = window.document.getElementById("code-annotation-line-highlight-gutter");\n            if (gutterDiv === null) {\n              gutterDiv = window.document.createElement("div");\n              gutterDiv.setAttribute("id", "code-annotation-line-highlight-gutter");\n              gutterDiv.style.position = \'absolute\';\n              const codeCell = window.document.getElementById(targetCell);\n              const gutter = codeCell.querySelector(\'.code-annotation-gutter\');\n              gutter.appendChild(gutterDiv);\n            }\n            gutterDiv.style.top = top - 2 + "px";\n            gutterDiv.style.height = height + 4 + "px";\n          }\n          selectedAnnoteEl = annoteEl;\n        }\n      };\n      const unselectCodeLines = () => {\n        const elementsIds = ["code-annotation-line-highlight", "code-annotation-line-highlight-gutter"];\n        elementsIds.forEach((elId) => {\n          const div = window.document.getElementById(elId);\n          if (div) {\n            div.remove();\n          }\n        });\n        selectedAnnoteEl = undefined;\n      };\n        // Handle positioning of the toggle\n    window.addEventListener(\n      "resize",\n      throttle(() => {\n        elRect = undefined;\n        if (selectedAnnoteEl) {\n          selectCodeLines(selectedAnnoteEl);\n        }\n      }, 10)\n    );\n    function throttle(fn, ms) {\n    let throttle = false;\n    let timer;\n      return (...args) => {\n        if(!throttle) { // first call gets through\n            fn.apply(this, args);\n            throttle = true;\n        } else { // all the others get 
throttled\n            if(timer) clearTimeout(timer); // cancel #2\n            timer = setTimeout(() => {\n              fn.apply(this, args);\n              timer = throttle = false;\n            }, ms);\n        }\n      };\n    }\n      // Attach click handler to the DT\n      const annoteDls = window.document.querySelectorAll(\'dt[data-target-cell]\');\n      for (const annoteDlNode of annoteDls) {\n        annoteDlNode.addEventListener(\'click\', (event) => {\n          const clickedEl = event.target;\n          if (clickedEl !== selectedAnnoteEl) {\n            unselectCodeLines();\n            const activeEl = window.document.querySelector(\'dt[data-target-cell].code-annotation-active\');\n            if (activeEl) {\n              activeEl.classList.remove(\'code-annotation-active\');\n            }\n            selectCodeLines(clickedEl);\n            clickedEl.classList.add(\'code-annotation-active\');\n          } else {\n            // Unselect the line\n            unselectCodeLines();\n            clickedEl.classList.remove(\'code-annotation-active\');\n          }\n        });\n      }\n  const findCites = (el) => {\n    const parentEl = el.parentElement;\n    if (parentEl) {\n      const cites = parentEl.dataset.cites;\n      if (cites) {\n        return {\n          el,\n          cites: cites.split(\' \')\n        };\n      } else {\n        return findCites(el.parentElement)\n      }\n    } else {\n      return undefined;\n    }\n  };\n  var bibliorefs = window.document.querySelectorAll(\'a[role="doc-biblioref"]\');\n  for (var i=0; i<bibliorefs.length; i++) {\n    const ref = bibliorefs[i];\n    const citeInfo = findCites(ref);\n    if (citeInfo) {\n      tippyHover(citeInfo.el, function() {\n        var popup = window.document.createElement(\'div\');\n        citeInfo.cites.forEach(function(cite) {\n          var citeDiv = window.document.createElement(\'div\');\n          citeDiv.classList.add(\'hanging-indent\');\n          
citeDiv.classList.add(\'csl-entry\');\n          var biblioDiv = window.document.getElementById(\'ref-\' + cite);\n          if (biblioDiv) {\n            citeDiv.innerHTML = biblioDiv.innerHTML;\n          }\n          popup.appendChild(citeDiv);\n        });\n        return popup.innerHTML;\n      });\n    }\n  }\n});\n</script>\n</div> <!-- /content -->\n<footer class="footer">\n  <div class="nav-footer">\n    <div class="nav-footer-left">\n      &nbsp;\n    </div>   \n    <div class="nav-footer-center">\n<p>Copyright 2023, Pingjian Ding</p>\n</div>\n    <div class="nav-footer-right">\n      &nbsp;\n    </div>\n  </div>\n</footer>\n\n\n\n\n</body></html>'

# Small sample HTML document used by the BeautifulSoup examples that follow.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p></html>
"""
from bs4 import BeautifulSoup
# Parse the document with the 'html.parser' backend and print an indented
# view of the resulting tree.
parsed = BeautifulSoup(html_doc, 'html.parser')
print(parsed.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>

# Attribute access by tag name yields a Tag object for the <title> element.
print(type(parsed.title))
print(parsed.title)

<class 'bs4.element.Tag'>
<title>The Dormouse's story</title>

# Name of the tag.
parsed.title.name

'title'

# The string contained in the <title> tag.
parsed.title.string

"The Dormouse's story"

# First <a> tag in the document.
parsed.a

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

# * Find all tags named 'a', the HTML tag used for hyperlinks.
parsed.find_all('a')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

# * The 'href' attribute of an 'a' tag holds the actual URL of the hyperlink.
for link in parsed.find_all('a'):
    print(link.get('href'))

http://example.com/elsie
http://example.com/lacie
http://example.com/tillie

# BeautifulSoup tags expose their HTML attributes through dictionary-style access:
shortdoc = """
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
"""
pshort = BeautifulSoup(shortdoc, 'html.parser')
print(pshort.p['class'])

['story']

# BeautifulSoup tags expose their descendants as Python attributes:
print(pshort.p.a)

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

# * Get the first tag in the tree with a given tag name
print(parsed.title)
print(parsed.a)
print(parsed.p)

<title>The Dormouse's story</title>
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
<p class="title"><b>The Dormouse's story</b></p>

# * If the tag's child is a single string (a leaf node), access it with tag.string
parsed.title.string

"The Dormouse's story"

# * Chain tag names to walk deeper into the tree
print(parsed.p.b)

<b>The Dormouse's story</b>

# * Use .contents to get the list of a tag's children
pshort.p.contents

['Once upon a time there were three little sisters; and their names were\n',
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 ',\n',
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 ' and\n',
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>,
 ';\nand they lived at the bottom of a well.']

# * Or use .children to get the children (as a Python iterator)
# Printing .children itself only shows the iterator object; loop to see the items.
print(pshort.p.children)
for i in pshort.p.children:
    print(i)

<list_iterator object at 0x000001D8B80821D0>
Once upon a time there were three little sisters; and their names were

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
,

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
 and

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
;
and they lived at the bottom of a well.

# * Use .descendants to traverse the tree recursively
# Printing .descendants itself only shows the generator object; loop to see the items.
print(pshort.p.descendants)
for i in pshort.p.descendants:
    print(i)

<generator object Tag.descendants at 0x000001D8B856DE00>
Once upon a time there were three little sisters; and their names were

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
Elsie
,

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
Lacie
 and

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
Tillie
;
and they lived at the bottom of a well.

# Keep a reference to the first <a> tag; the navigation examples start from it.
link = parsed.a
link

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

# Use .parent to access a tag's parent tag
link.parent

<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

# .parents is a generator, so evaluating it on its own only shows the generator object.
link.parents

<generator object PageElement.parents at 0x000001E188017F40>

# Use .parents to walk the whole chain of ancestors back to the root
for parent in link.parents:
    print(parent.name)

p
body
html
[document]

# Use .previous_sibling and .next_sibling to move "left" and "right" in the tree
print(link.previous_sibling)
print(link.next_sibling)

Once upon a time there were three little sisters; and their names were

,

# Find all tags named 'p'
parsed.find_all('p')

[<p class="title"><b>The Dormouse's story</b></p>,
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

# Find all tags whose name matches 'a' or 'b'
parsed.find_all(['a', 'b'])

[<b>The Dormouse's story</b>,
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

# Find all tags whose name matches the given regular expression
# (here: names starting with 'b').
import re
parsed.find_all(re.compile(r'^b'))

[<body>
 <p class="title"><b>The Dormouse's story</b></p>
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>
 <p class="story">...</p>
 </body>,
 <b>The Dormouse's story</b>]

# 传入一个函数，该函数根据标签返回True/False，find_all将只返回评估为True的标签
def has_class_but_no_id(tag):
    """Return True when *tag* has a ``class`` attribute but no ``id`` attribute.

    Intended as a filter function for ``find_all``: only tags for which this
    returns True are kept.
    """
    # Without a class attribute the tag can never match, so stop early.
    if not tag.has_attr('class'):
        return False
    return not tag.has_attr('id')
parsed.find_all(has_class_but_no_id)

[<p class="title"><b>The Dormouse's story</b></p>,
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

# 这个<p>标签包含一个完整的句子，但句子的某些部分是链接，所以p.string返回None。如果想获取去掉链接标签后的完整文本，该怎么办？
print(pshort)
print(pshort.p.string is None)

<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

True

pshort.p.contents

['Once upon a time there were three little sisters; and their names were\n',
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 ',\n',
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 ' and\n',
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>,
 ';\nand they lived at the bottom of a well.']

# 注意：在BeautifulSoup中，尝试访问不存在的tag.string是导致bug/错误的常见原因！
pshort.p.get_text()

'Once upon a time there were three little sisters; and their names were\nElsie,\nLacie and\nTillie;\nand they lived at the bottom of a well.'

from bs4 import BeautifulSoup

# Decode explicitly as UTF-8 (XML's default encoding) instead of relying on
# the platform's locale encoding, which differs between operating systems.
with open('short.xml', 'r', encoding='utf-8') as f:
    file = f.read()

# 'xml' selects Beautiful Soup's XML parser; prettify() renders the parsed
# tree with one tag per line and nested indentation.
pshort = BeautifulSoup(file, 'xml')
print(pshort.prettify())

<?xml version="1.0" encoding="utf-8"?>
<teachers>
 <teacher>
  <name>
   Sam Davies
  </name>
  <age>
   35
  </age>
  <subject>
   Maths
  </subject>
 </teacher>
 <teacher>
  <name>
   Cassie Stone
  </name>
  <age>
   24
  </age>
  <subject>
   Science
  </subject>
 </teacher>
 <teacher>
  <name>
   Derek Brandon
  </name>
  <age>
   32
  </age>
  <subject>
   History
  </subject>
 </teacher>
</teachers>

pshort.find_all('name')

[<name>Sam Davies</name>,
 <name>Cassie Stone</name>,
 <name>Derek Brandon</name>]

for i in pshort.teachers.children:
    print(i)


<teacher>
<name>Sam Davies</name>
<age>35</age>
<subject>Maths</subject>
</teacher>


<teacher>
<name>Cassie Stone</name>
<age>24</age>
<subject>Science</subject>
</teacher>


<teacher>
<name>Derek Brandon</name>
<age>32</age>
<subject>History</subject>
</teacher>

import json
# JSON files are UTF-8 encoded; pass the encoding explicitly so the result
# does not depend on the platform's default locale encoding.
with open("example.json", encoding="utf-8") as f:
    # json.load parses the open file object into Python objects.
    data = json.load(f)
data

{'first_name': 'John',
 'last_name': 'Smith',
 'is_alive': True,
 'age': 27,
 'address': {'street_address': '21 2nd Street',
  'city': 'New York',
  'state': 'NY',
  'postal_code': '10021-3100'},
 'phone_numbers': [{'type': 'home', 'number': '212 555-1234'},
  {'type': 'office', 'number': '646 555-4567'}],
 'children': ['Catherine', 'Thomas', 'Trevor'],
 'spouse': None}

import json
json_string = '{"first_name": "John", "last_name": "Smith",\
    "alma_mater": "Princeton University"}'
# json.loads解析字符串并返回JSON对象。
parsed_json = json.loads(json_string)
print(type(parsed_json))
parsed_json

<class 'dict'>

{'first_name': 'John',
 'last_name': 'Smith',
 'alma_mater': 'Princeton University'}

# json.dumps将JSON对象重新转换为字符串。
json.dumps(parsed_json)

'{"first_name": "John", "last_name": "Smith", "alma_mater": "Princeton University"}'

type(parsed_json), parsed_json

(dict,
 {'first_name': 'John',
  'last_name': 'Smith',
  'alma_mater': 'Princeton University'})

parsed_json['first_name']

'John'

# JSON files are UTF-8 encoded; pass the encoding explicitly so the result
# does not depend on the platform's default locale encoding.
with open("example2.json", encoding="utf-8") as f:
    # json.load parses the open file object into Python objects.
    data = json.load(f)
data

{'id': '0001',
 'type': 'donut',
 'name': 'Cake',
 'ppu': 0.55,
 'batters': {'batter': [{'id': '1001', 'type': 'Regular'},
   {'id': '1002', 'type': 'Chocolate'},
   {'id': '1003', 'type': 'Blueberry'},
   {'id': '1004', 'type': "Devil's Food"}]},
 'topping': [{'id': '5001', 'type': 'None'},
  {'id': '5002', 'type': 'Glazed'},
  {'id': '5005', 'type': 'Sugar'},
  {'id': '5007', 'type': 'Powdered Sugar'},
  {'id': '5006', 'type': 'Chocolate'},
  {'id': '5003', 'type': 'Maple'},
  {'id': '5004', 'type': 'Raisin'}]}

Python数据处理¶

07. 来自网络的结构化数据¶

网站上有很多有趣的数据¶

网络数据的三个方面¶

客户端-服务器模型¶

URL的解剖¶

在Python中访问网站：urllib¶

使用urllib¶

getcode()¶

geturl()¶

info()¶

HTML速成课程¶

HTML速成课程¶

HTML速成课程：回顾¶

回到urllib¶

在Python中解析HTML/XML：beautifulsoup¶

在Python中解析HTML/XML：beautifulsoup¶

示例（来自BeautifulSoup文档）¶

BeautifulSoup允许导航HTML标签¶

关于属性的说明¶

HTML树结构¶

导航HTML树¶

导航HTML树¶

导航HTML树¶

搜索树：find_all和相关方法¶

关于find_all的更多信息¶

内容扁平化：get_text()¶

XML - 可扩展标记语言，.xml¶

JSON - JavaScript对象表示法¶

Python json模块¶

Python json模块¶

JSON对象可以有非常复杂的结构¶