{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "TtngHw6436m1"
      },
      "source": [
        "## Text data\n",
        "\n",
        "### [Kaggle fake news dataset](https://www.kaggle.com/datasets/clmentbisaillon/fake-and-real-news-dataset?resource=download)\n",
        "\n",
        "### [Spacy tutorial](https://www.kaggle.com/code/sudalairajkumar/getting-started-with-spacy)\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 4,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "gByzavMFiGPZ",
        "outputId": "501f4383-d2da-4d3b-af0e-18aafd4176dd"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Collecting en-core-web-md==3.7.1\n",
            "  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m42.8/42.8 MB\u001b[0m \u001b[31m18.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hRequirement already satisfied: spacy<3.8.0,>=3.7.2 in /usr/local/lib/python3.11/dist-packages (from en-core-web-md==3.7.1) (3.7.5)\n",
            "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /usr/local/lib/python3.11/dist-packages (from spacy<3.8.0,>=3.7.2->en-core-web-md==3.7.1) (3.0.12)\n",
            "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /usr/local/lib/python3.11/dist-packages (from spacy<3.8.0,>=3.7.2->en-core-web-md==3.7.1) (1.0.5)\n",
            "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.11/dist-packages (from spacy<3.8.0,>=3.7.2->en-core-web-md==3.7.1) (1.0.12)\n",
            "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.11/dist-packages (from spacy<3.8.0,>=3.7.2->en-core-web-md==3.7.1) (2.0.11)\n",
            "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.11/dist-packages (from spacy<3.8.0,>=3.7.2->en-core-web-md==3.7.1) (3.0.9)\n",
            "Requirement already satisfied: thinc<8.3.0,>=8.2.2 in /usr/local/lib/python3.11/dist-packages (from spacy<3.8.0,>=3.7.2->en-core-web-md==3.7.1) (8.2.5)\n",
            "Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /usr/local/lib/python3.11/dist-packages (from spacy<3.8.0,>=3.7.2->en-core-web-md==3.7.1) (1.1.3)\n",
            "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /usr/local/lib/python3.11/dist-packages (from spacy<3.8.0,>=3.7.2->en-core-web-md==3.7.1) (2.5.1)\n",
            "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /usr/local/lib/python3.11/dist-packages (from spacy<3.8.0,>=3.7.2->en-core-web-md==3.7.1) (2.0.10)\n",
            "Requirement already satisfied: weasel<0.5.0,>=0.1.0 in /usr/local/lib/python3.11/dist-packages (from spacy<3.8.0,>=3.7.2->en-core-web-md==3.7.1) (0.4.1)\n",
            "Requirement already satisfied: typer<1.0.0,>=0.3.0 in /usr/local/lib/python3.11/dist-packages (from spacy<3.8.0,>=3.7.2->en-core-web-md==3.7.1) (0.15.1)\n",
            "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.11/dist-packages (from spacy<3.8.0,>=3.7.2->en-core-web-md==3.7.1) (4.67.1)\n",
            "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.11/dist-packages (from spacy<3.8.0,>=3.7.2->en-core-web-md==3.7.1) (2.32.3)\n",
            "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in /usr/local/lib/python3.11/dist-packages (from spacy<3.8.0,>=3.7.2->en-core-web-md==3.7.1) (2.10.6)\n",
            "Requirement already satisfied: jinja2 in /usr/local/lib/python3.11/dist-packages (from spacy<3.8.0,>=3.7.2->en-core-web-md==3.7.1) (3.1.5)\n",
            "Requirement already satisfied: setuptools in /usr/local/lib/python3.11/dist-packages (from spacy<3.8.0,>=3.7.2->en-core-web-md==3.7.1) (75.1.0)\n",
            "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.11/dist-packages (from spacy<3.8.0,>=3.7.2->en-core-web-md==3.7.1) (24.2)\n",
            "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /usr/local/lib/python3.11/dist-packages (from spacy<3.8.0,>=3.7.2->en-core-web-md==3.7.1) (3.5.0)\n",
            "Requirement already satisfied: numpy>=1.19.0 in /usr/local/lib/python3.11/dist-packages (from spacy<3.8.0,>=3.7.2->en-core-web-md==3.7.1) (1.26.4)\n",
            "Requirement already satisfied: language-data>=1.2 in /usr/local/lib/python3.11/dist-packages (from langcodes<4.0.0,>=3.2.0->spacy<3.8.0,>=3.7.2->en-core-web-md==3.7.1) (1.3.0)\n",
            "Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.11/dist-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.2->en-core-web-md==3.7.1) (0.7.0)\n",
            "Requirement already satisfied: pydantic-core==2.27.2 in /usr/local/lib/python3.11/dist-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.2->en-core-web-md==3.7.1) (2.27.2)\n",
            "Requirement already satisfied: typing-extensions>=4.12.2 in /usr/local/lib/python3.11/dist-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.2->en-core-web-md==3.7.1) (4.12.2)\n",
            "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-md==3.7.1) (3.4.1)\n",
            "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-md==3.7.1) (3.10)\n",
            "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-md==3.7.1) (2.3.0)\n",
            "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-md==3.7.1) (2025.1.31)\n",
            "Requirement already satisfied: blis<0.8.0,>=0.7.8 in /usr/local/lib/python3.11/dist-packages (from thinc<8.3.0,>=8.2.2->spacy<3.8.0,>=3.7.2->en-core-web-md==3.7.1) (0.7.11)\n",
            "Requirement already satisfied: confection<1.0.0,>=0.0.1 in /usr/local/lib/python3.11/dist-packages (from thinc<8.3.0,>=8.2.2->spacy<3.8.0,>=3.7.2->en-core-web-md==3.7.1) (0.1.5)\n",
            "Requirement already satisfied: click>=8.0.0 in /usr/local/lib/python3.11/dist-packages (from typer<1.0.0,>=0.3.0->spacy<3.8.0,>=3.7.2->en-core-web-md==3.7.1) (8.1.8)\n",
            "Requirement already satisfied: shellingham>=1.3.0 in /usr/local/lib/python3.11/dist-packages (from typer<1.0.0,>=0.3.0->spacy<3.8.0,>=3.7.2->en-core-web-md==3.7.1) (1.5.4)\n",
            "Requirement already satisfied: rich>=10.11.0 in /usr/local/lib/python3.11/dist-packages (from typer<1.0.0,>=0.3.0->spacy<3.8.0,>=3.7.2->en-core-web-md==3.7.1) (13.9.4)\n",
            "Requirement already satisfied: cloudpathlib<1.0.0,>=0.7.0 in /usr/local/lib/python3.11/dist-packages (from weasel<0.5.0,>=0.1.0->spacy<3.8.0,>=3.7.2->en-core-web-md==3.7.1) (0.20.0)\n",
            "Requirement already satisfied: smart-open<8.0.0,>=5.2.1 in /usr/local/lib/python3.11/dist-packages (from weasel<0.5.0,>=0.1.0->spacy<3.8.0,>=3.7.2->en-core-web-md==3.7.1) (7.1.0)\n",
            "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.11/dist-packages (from jinja2->spacy<3.8.0,>=3.7.2->en-core-web-md==3.7.1) (3.0.2)\n",
            "Requirement already satisfied: marisa-trie>=1.1.0 in /usr/local/lib/python3.11/dist-packages (from language-data>=1.2->langcodes<4.0.0,>=3.2.0->spacy<3.8.0,>=3.7.2->en-core-web-md==3.7.1) (1.2.1)\n",
            "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.11/dist-packages (from rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy<3.8.0,>=3.7.2->en-core-web-md==3.7.1) (3.0.0)\n",
            "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.11/dist-packages (from rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy<3.8.0,>=3.7.2->en-core-web-md==3.7.1) (2.18.0)\n",
            "Requirement already satisfied: wrapt in /usr/local/lib/python3.11/dist-packages (from smart-open<8.0.0,>=5.2.1->weasel<0.5.0,>=0.1.0->spacy<3.8.0,>=3.7.2->en-core-web-md==3.7.1) (1.17.2)\n",
            "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.11/dist-packages (from markdown-it-py>=2.2.0->rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy<3.8.0,>=3.7.2->en-core-web-md==3.7.1) (0.1.2)\n",
            "Installing collected packages: en-core-web-md\n",
            "Successfully installed en-core-web-md-3.7.1\n",
            "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n",
            "You can now load the package via spacy.load('en_core_web_md')\n",
            "\u001b[38;5;3m⚠ Restart to reload dependencies\u001b[0m\n",
            "If you are in a Jupyter or Colab notebook, you may need to restart Python in\n",
            "order to load all the package's dependencies. You can do this by selecting the\n",
            "'Restart kernel' or 'Restart runtime' option.\n"
          ]
        }
      ],
      "source": [
        "! python -m spacy download en_core_web_md"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 5,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "4GOUq5ZB1PkA",
        "outputId": "052002d2-09e9-402f-8b23-63adc9bd90b5"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Requirement already satisfied: spacytextblob in /usr/local/lib/python3.11/dist-packages (5.0.0)\n",
            "Requirement already satisfied: spacy>=3.0.0 in /usr/local/lib/python3.11/dist-packages (from spacytextblob) (3.7.5)\n",
            "Requirement already satisfied: textblob>=0.18.0.post0 in /usr/local/lib/python3.11/dist-packages (from spacytextblob) (0.19.0)\n",
            "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /usr/local/lib/python3.11/dist-packages (from spacy>=3.0.0->spacytextblob) (3.0.12)\n",
            "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /usr/local/lib/python3.11/dist-packages (from spacy>=3.0.0->spacytextblob) (1.0.5)\n",
            "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.11/dist-packages (from spacy>=3.0.0->spacytextblob) (1.0.12)\n",
            "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.11/dist-packages (from spacy>=3.0.0->spacytextblob) (2.0.11)\n",
            "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.11/dist-packages (from spacy>=3.0.0->spacytextblob) (3.0.9)\n",
            "Requirement already satisfied: thinc<8.3.0,>=8.2.2 in /usr/local/lib/python3.11/dist-packages (from spacy>=3.0.0->spacytextblob) (8.2.5)\n",
            "Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /usr/local/lib/python3.11/dist-packages (from spacy>=3.0.0->spacytextblob) (1.1.3)\n",
            "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /usr/local/lib/python3.11/dist-packages (from spacy>=3.0.0->spacytextblob) (2.5.1)\n",
            "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /usr/local/lib/python3.11/dist-packages (from spacy>=3.0.0->spacytextblob) (2.0.10)\n",
            "Requirement already satisfied: weasel<0.5.0,>=0.1.0 in /usr/local/lib/python3.11/dist-packages (from spacy>=3.0.0->spacytextblob) (0.4.1)\n",
            "Requirement already satisfied: typer<1.0.0,>=0.3.0 in /usr/local/lib/python3.11/dist-packages (from spacy>=3.0.0->spacytextblob) (0.15.1)\n",
            "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.11/dist-packages (from spacy>=3.0.0->spacytextblob) (4.67.1)\n",
            "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.11/dist-packages (from spacy>=3.0.0->spacytextblob) (2.32.3)\n",
            "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in /usr/local/lib/python3.11/dist-packages (from spacy>=3.0.0->spacytextblob) (2.10.6)\n",
            "Requirement already satisfied: jinja2 in /usr/local/lib/python3.11/dist-packages (from spacy>=3.0.0->spacytextblob) (3.1.5)\n",
            "Requirement already satisfied: setuptools in /usr/local/lib/python3.11/dist-packages (from spacy>=3.0.0->spacytextblob) (75.1.0)\n",
            "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.11/dist-packages (from spacy>=3.0.0->spacytextblob) (24.2)\n",
            "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /usr/local/lib/python3.11/dist-packages (from spacy>=3.0.0->spacytextblob) (3.5.0)\n",
            "Requirement already satisfied: numpy>=1.19.0 in /usr/local/lib/python3.11/dist-packages (from spacy>=3.0.0->spacytextblob) (1.26.4)\n",
            "Requirement already satisfied: nltk>=3.9 in /usr/local/lib/python3.11/dist-packages (from textblob>=0.18.0.post0->spacytextblob) (3.9.1)\n",
            "Requirement already satisfied: language-data>=1.2 in /usr/local/lib/python3.11/dist-packages (from langcodes<4.0.0,>=3.2.0->spacy>=3.0.0->spacytextblob) (1.3.0)\n",
            "Requirement already satisfied: click in /usr/local/lib/python3.11/dist-packages (from nltk>=3.9->textblob>=0.18.0.post0->spacytextblob) (8.1.8)\n",
            "Requirement already satisfied: joblib in /usr/local/lib/python3.11/dist-packages (from nltk>=3.9->textblob>=0.18.0.post0->spacytextblob) (1.4.2)\n",
            "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.11/dist-packages (from nltk>=3.9->textblob>=0.18.0.post0->spacytextblob) (2024.11.6)\n",
            "Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.11/dist-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy>=3.0.0->spacytextblob) (0.7.0)\n",
            "Requirement already satisfied: pydantic-core==2.27.2 in /usr/local/lib/python3.11/dist-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy>=3.0.0->spacytextblob) (2.27.2)\n",
            "Requirement already satisfied: typing-extensions>=4.12.2 in /usr/local/lib/python3.11/dist-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy>=3.0.0->spacytextblob) (4.12.2)\n",
            "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=3.0.0->spacytextblob) (3.4.1)\n",
            "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=3.0.0->spacytextblob) (3.10)\n",
            "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=3.0.0->spacytextblob) (2.3.0)\n",
            "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=3.0.0->spacytextblob) (2025.1.31)\n",
            "Requirement already satisfied: blis<0.8.0,>=0.7.8 in /usr/local/lib/python3.11/dist-packages (from thinc<8.3.0,>=8.2.2->spacy>=3.0.0->spacytextblob) (0.7.11)\n",
            "Requirement already satisfied: confection<1.0.0,>=0.0.1 in /usr/local/lib/python3.11/dist-packages (from thinc<8.3.0,>=8.2.2->spacy>=3.0.0->spacytextblob) (0.1.5)\n",
            "Requirement already satisfied: shellingham>=1.3.0 in /usr/local/lib/python3.11/dist-packages (from typer<1.0.0,>=0.3.0->spacy>=3.0.0->spacytextblob) (1.5.4)\n",
            "Requirement already satisfied: rich>=10.11.0 in /usr/local/lib/python3.11/dist-packages (from typer<1.0.0,>=0.3.0->spacy>=3.0.0->spacytextblob) (13.9.4)\n",
            "Requirement already satisfied: cloudpathlib<1.0.0,>=0.7.0 in /usr/local/lib/python3.11/dist-packages (from weasel<0.5.0,>=0.1.0->spacy>=3.0.0->spacytextblob) (0.20.0)\n",
            "Requirement already satisfied: smart-open<8.0.0,>=5.2.1 in /usr/local/lib/python3.11/dist-packages (from weasel<0.5.0,>=0.1.0->spacy>=3.0.0->spacytextblob) (7.1.0)\n",
            "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.11/dist-packages (from jinja2->spacy>=3.0.0->spacytextblob) (3.0.2)\n",
            "Requirement already satisfied: marisa-trie>=1.1.0 in /usr/local/lib/python3.11/dist-packages (from language-data>=1.2->langcodes<4.0.0,>=3.2.0->spacy>=3.0.0->spacytextblob) (1.2.1)\n",
            "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.11/dist-packages (from rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy>=3.0.0->spacytextblob) (3.0.0)\n",
            "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.11/dist-packages (from rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy>=3.0.0->spacytextblob) (2.18.0)\n",
            "Requirement already satisfied: wrapt in /usr/local/lib/python3.11/dist-packages (from smart-open<8.0.0,>=5.2.1->weasel<0.5.0,>=0.1.0->spacy>=3.0.0->spacytextblob) (1.17.2)\n",
            "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.11/dist-packages (from markdown-it-py>=2.2.0->rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy>=3.0.0->spacytextblob) (0.1.2)\n"
          ]
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "<spacytextblob.spacytextblob.SpacyTextBlob at 0x7ebe69252310>"
            ]
          },
          "metadata": {},
          "execution_count": 5
        }
      ],
      "source": [
        "# Downloading the files needed to load the spacy language model\n",
        "!pip3 install spacytextblob\n",
        "\n",
        "import pandas as pd\n",
        "from datetime import datetime # for grabbing date ranges\n",
        "import spacy # for natural language processing\n",
        "from spacytextblob.spacytextblob import SpacyTextBlob # for sentiment analysis\n",
        "\n",
        "# This is loading the spacy language model\n",
        "nlp = spacy.load('en_core_web_md')\n",
        "nlp.add_pipe('spacytextblob')\n",
        "\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "u5_OROd8o3VM"
      },
      "source": [
        "### NLP crash course with spacy\n",
        "\n",
        "+ A text dataset is often called a corpus. Especially if it has been curated in some fashion (labeled, annotated, ...).\n",
        "\n",
        "+ Spacy is a python package that performs standard NLP analyses of text.\n",
        "\n",
        "+ Tokenization: Splitting text into units\n",
        "  - Sentence\n",
        "  - Word based tokenization (wasn't)\n",
        "  - Subword based tokenization (morpheme-like units)\n",
        "    + Byte pair encoding\n",
        "    + WordPiece\n",
        "    + Unigram\n",
        "    + SentencePiece\n",
        "  - Character based tokenization\n",
        "+ Lemmatization: mapping word variants to canonical root form\n",
        "  - removing pluralization\n",
        "  - removing tense\n",
        "+ Part of speech tagging: Labeling words with part of speech (verb, noun, etc.)\n",
        "+ Noun phrase identification\n",
        "+ Named entity recognition (NER)\n",
        "  - person, place, country, currency, ...\n",
        "+ Sentiment analysis"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 6,
      "metadata": {
        "id": "ZnJM1lrwrR7Y",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "1325225a-e84b-4340-bbfc-b8ae59ee2750"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "['Dr.', 'James', 'Harvey', ',', 'the', 'big', 'furry', 'cat', 'ate', 'the', 'little', 'brown', 'mice', 'who', 'were', \"n't\", 'very', 'happy', '.']\n",
            "['Dr.', 'James', 'Harvey', ',', 'the', 'big', 'furry', 'cat', 'eat', 'the', 'little', 'brown', 'mouse', 'who', 'be', 'not', 'very', 'happy', '.']\n",
            "['PROPN', 'PROPN', 'PROPN', 'PUNCT', 'DET', 'ADJ', 'ADJ', 'NOUN', 'VERB', 'DET', 'ADJ', 'ADJ', 'NOUN', 'PRON', 'AUX', 'PART', 'ADV', 'ADJ', 'PUNCT']\n",
            "['NNP', 'NNP', 'NNP', ',', 'DT', 'JJ', 'JJ', 'NN', 'VBD', 'DT', 'JJ', 'JJ', 'NNS', 'WP', 'VBD', 'RB', 'RB', 'JJ', '.']\n",
            "[Dr. James Harvey, the big furry cat, the little brown mice, who]\n",
            "adjective (English), other noun-modifier (Chinese)\n",
            "('CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART')\n",
            "['James Harvey, PERSON, People, including fictional']\n"
          ]
        }
      ],
      "source": [
        "# Showing you attributes of tokens you have access to in Spacy.\n",
        "\n",
        "# Example text for demonstrations\n",
        "text = \"Dr. James Harvey, the big furry cat ate the little brown mice who weren't very happy.\"\n",
        "\n",
        "doc = nlp(text)\n",
        "\n",
        "# Using a python \"list comprehension\" in order to go through all the tokens in the document\"\n",
        "print([t.text for t in doc]) # word based tokenization\n",
        "print([t.lemma_ for t in doc]) # lemmatization\n",
        "print([t.pos_ for t in doc]) # Part of speech tagging\n",
        "print([t.tag_ for t in doc]) # Fine grained part of speech tagging\n",
        "print([n for n in doc.noun_chunks]) # Noun phrase parsing\n",
        "print(spacy.explain('JJ'))\n",
        "print(nlp.get_pipe('ner').labels)\n",
        "print([f'{ent.text}, {ent.label_}, {spacy.explain(ent.label_)}' for ent in doc.ents])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 8,
      "metadata": {
        "id": "297oBrPC1ZpT"
      },
      "outputs": [],
      "source": [
        "fake = pd.read_csv('https://fw.cs.wwu.edu/~hutchib2/doc/Fake.csv')\n",
        "real = pd.read_csv('https://fw.cs.wwu.edu/~hutchib2/doc/True.csv')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 9,
      "metadata": {
        "id": "espKevIY1vNB"
      },
      "outputs": [],
      "source": [
        "def clean_dates(df):\n",
        "  \"\"\"\n",
        "  Translates dates from various string formats to python Datetime objects.\n",
        "  Adds columns Date: Datetime, month: int, year: int, day: int.\n",
        "  Also tosses out corrupted rows in the dataframe.\n",
        "\n",
        "  :param df: (DataFrame) With column 'date'\n",
        "\n",
        "  :returns: (DataFrame) Cleaned with additional columns Date, month, day, year\n",
        "  \"\"\"\n",
        "  formats = {1: '%d-%b-%y', 0: '%B %d, %Y', 2: '%b %d, %Y'}\n",
        "  dates = []\n",
        "  bad = []\n",
        "  for i, date in enumerate(df['date']):\n",
        "    try:\n",
        "      if date[0].isdigit():\n",
        "        f = formats[1]\n",
        "      elif len(date.split()[0].strip()) > 3:\n",
        "        f = formats[0]\n",
        "      else:\n",
        "        f = formats[2]\n",
        "      dates.append(datetime.strptime(date.strip(), f))\n",
        "    except:\n",
        "      bad.append([date, format, i])\n",
        "  print(bad)\n",
        "  df = df.drop([b[-1] for b in bad])\n",
        "  df['Date'] = dates\n",
        "  df['month'] = pd.DatetimeIndex(dates).month\n",
        "  df['year'] = pd.DatetimeIndex(dates).year\n",
        "  df['day'] = pd.DatetimeIndex(dates).day\n",
        "  return df"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 10,
      "metadata": {
        "id": "z36fR7Re1yBI",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "15c328bb-e988-4c53-a994-2288a4584405"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "[['https://100percentfedup.com/served-roy-moore-vietnamletter-veteran-sets-record-straight-honorable-decent-respectable-patriotic-commander-soldier/', <built-in function format>, 9358], ['https://100percentfedup.com/video-hillary-asked-about-trump-i-just-want-to-eat-some-pie/', <built-in function format>, 15507], ['https://100percentfedup.com/12-yr-old-black-conservative-whose-video-to-obama-went-viral-do-you-really-love-america-receives-death-threats-from-left/', <built-in function format>, 15508], ['https://fedup.wpengine.com/wp-content/uploads/2015/04/hillarystreetart.jpg', <built-in function format>, 15839], ['https://fedup.wpengine.com/wp-content/uploads/2015/04/entitled.jpg', <built-in function format>, 15840], ['https://fedup.wpengine.com/wp-content/uploads/2015/04/hillarystreetart.jpg', <built-in function format>, 17432], ['https://fedup.wpengine.com/wp-content/uploads/2015/04/entitled.jpg', <built-in function format>, 17433], ['MSNBC HOST Rudely Assumes Steel Worker Would Never Let His Son Follow in His Footsteps…He Couldn’t Be More Wrong [Video]', <built-in function format>, 18933], ['https://fedup.wpengine.com/wp-content/uploads/2015/04/hillarystreetart.jpg', <built-in function format>, 21869], ['https://fedup.wpengine.com/wp-content/uploads/2015/04/entitled.jpg', <built-in function format>, 21870]]\n",
            "[]\n"
          ]
        }
      ],
      "source": [
        "fake = clean_dates(fake)\n",
        "real = clean_dates(real)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 11,
      "metadata": {
        "id": "zJdyO4Si2zE8"
      },
      "outputs": [],
      "source": [
        "# Boolean indexing\n",
        "short_fake = fake[(fake.year == 2017) & (fake.month == 1)]\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 12,
      "metadata": {
        "id": "8WRFQwvlR0Tx",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 423
        },
        "outputId": "2a1acdd6-df82-4c4c-bc54-b57a68acf323"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "                                                   title  \\\n",
              "2749    Trump’s SCOTUS Pick Sided With Hobby Lobby Ag...   \n",
              "2750    It Took A Scathing Letter From Canada’s Prime...   \n",
              "2751    WATCH: Jake Tapper STUNNED Into Disbelief Lis...   \n",
              "2752    An Anonymous Group Just Revealed The Direct P...   \n",
              "2753    Trump Jr. Just ‘Liked’ Tweet Praising Mosque ...   \n",
              "...                                                  ...   \n",
              "23076  SOUR GRAPES? Whatever happened to the ‘smooth ...   \n",
              "23077  HACKING DEMOCRACY? CIA Accusing Russia of Doin...   \n",
              "23078                       Good News for Silver in 2017   \n",
              "23079             Gerald Celente: Top 10 Trends for 2017   \n",
              "23080  CNN’s Don Lemon: Is He an Alcoholic or Just a ...   \n",
              "\n",
              "                                                    text      subject  \\\n",
              "2749   On Tuesday, Donald Trump announced the identit...         News   \n",
              "2750   Fox News couldn t wait to try to spin the Queb...         News   \n",
              "2751   Sean Spicer is doing his level best to make en...         News   \n",
              "2752   Just after Donald Trump was sworn in, his admi...         News   \n",
              "2753   When it comes to how shameless the Trump famil...         News   \n",
              "...                                                  ...          ...   \n",
              "23076   Andrew Malcolm McClatchy News You better stop...  Middle-east   \n",
              "23077  Peter Certo Other WordsEven in an election yea...  Middle-east   \n",
              "23078   James Burgess Oil PricePrecious metals are an...  Middle-east   \n",
              "23079  What can we expect in 2017? Inflated markets, ...  Middle-east   \n",
              "23080  Day in and day out, CNN works hard to try and ...  Middle-east   \n",
              "\n",
              "                   date       Date  month  year  day  \n",
              "2749   January 31, 2017 2017-01-31      1  2017   31  \n",
              "2750   January 31, 2017 2017-01-31      1  2017   31  \n",
              "2751   January 31, 2017 2017-01-31      1  2017   31  \n",
              "2752   January 31, 2017 2017-01-31      1  2017   31  \n",
              "2753   January 31, 2017 2017-01-31      1  2017   31  \n",
              "...                 ...        ...    ...   ...  ...  \n",
              "23076   January 3, 2017 2017-01-03      1  2017    3  \n",
              "23077   January 3, 2017 2017-01-03      1  2017    3  \n",
              "23078   January 3, 2017 2017-01-03      1  2017    3  \n",
              "23079   January 2, 2017 2017-01-02      1  2017    2  \n",
              "23080   January 2, 2017 2017-01-02      1  2017    2  \n",
              "\n",
              "[1029 rows x 8 columns]"
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-035a5020-58f6-414d-99c2-0d5af6f9a059\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>title</th>\n",
              "      <th>text</th>\n",
              "      <th>subject</th>\n",
              "      <th>date</th>\n",
              "      <th>Date</th>\n",
              "      <th>month</th>\n",
              "      <th>year</th>\n",
              "      <th>day</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>2749</th>\n",
              "      <td>Trump’s SCOTUS Pick Sided With Hobby Lobby Ag...</td>\n",
              "      <td>On Tuesday, Donald Trump announced the identit...</td>\n",
              "      <td>News</td>\n",
              "      <td>January 31, 2017</td>\n",
              "      <td>2017-01-31</td>\n",
              "      <td>1</td>\n",
              "      <td>2017</td>\n",
              "      <td>31</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2750</th>\n",
              "      <td>It Took A Scathing Letter From Canada’s Prime...</td>\n",
              "      <td>Fox News couldn t wait to try to spin the Queb...</td>\n",
              "      <td>News</td>\n",
              "      <td>January 31, 2017</td>\n",
              "      <td>2017-01-31</td>\n",
              "      <td>1</td>\n",
              "      <td>2017</td>\n",
              "      <td>31</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2751</th>\n",
              "      <td>WATCH: Jake Tapper STUNNED Into Disbelief Lis...</td>\n",
              "      <td>Sean Spicer is doing his level best to make en...</td>\n",
              "      <td>News</td>\n",
              "      <td>January 31, 2017</td>\n",
              "      <td>2017-01-31</td>\n",
              "      <td>1</td>\n",
              "      <td>2017</td>\n",
              "      <td>31</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2752</th>\n",
              "      <td>An Anonymous Group Just Revealed The Direct P...</td>\n",
              "      <td>Just after Donald Trump was sworn in, his admi...</td>\n",
              "      <td>News</td>\n",
              "      <td>January 31, 2017</td>\n",
              "      <td>2017-01-31</td>\n",
              "      <td>1</td>\n",
              "      <td>2017</td>\n",
              "      <td>31</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2753</th>\n",
              "      <td>Trump Jr. Just ‘Liked’ Tweet Praising Mosque ...</td>\n",
              "      <td>When it comes to how shameless the Trump famil...</td>\n",
              "      <td>News</td>\n",
              "      <td>January 31, 2017</td>\n",
              "      <td>2017-01-31</td>\n",
              "      <td>1</td>\n",
              "      <td>2017</td>\n",
              "      <td>31</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>...</th>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>23076</th>\n",
              "      <td>SOUR GRAPES? Whatever happened to the ‘smooth ...</td>\n",
              "      <td>Andrew Malcolm McClatchy News You better stop...</td>\n",
              "      <td>Middle-east</td>\n",
              "      <td>January 3, 2017</td>\n",
              "      <td>2017-01-03</td>\n",
              "      <td>1</td>\n",
              "      <td>2017</td>\n",
              "      <td>3</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>23077</th>\n",
              "      <td>HACKING DEMOCRACY? CIA Accusing Russia of Doin...</td>\n",
              "      <td>Peter Certo Other WordsEven in an election yea...</td>\n",
              "      <td>Middle-east</td>\n",
              "      <td>January 3, 2017</td>\n",
              "      <td>2017-01-03</td>\n",
              "      <td>1</td>\n",
              "      <td>2017</td>\n",
              "      <td>3</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>23078</th>\n",
              "      <td>Good News for Silver in 2017</td>\n",
              "      <td>James Burgess Oil PricePrecious metals are an...</td>\n",
              "      <td>Middle-east</td>\n",
              "      <td>January 3, 2017</td>\n",
              "      <td>2017-01-03</td>\n",
              "      <td>1</td>\n",
              "      <td>2017</td>\n",
              "      <td>3</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>23079</th>\n",
              "      <td>Gerald Celente: Top 10 Trends for 2017</td>\n",
              "      <td>What can we expect in 2017? Inflated markets, ...</td>\n",
              "      <td>Middle-east</td>\n",
              "      <td>January 2, 2017</td>\n",
              "      <td>2017-01-02</td>\n",
              "      <td>1</td>\n",
              "      <td>2017</td>\n",
              "      <td>2</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>23080</th>\n",
              "      <td>CNN’s Don Lemon: Is He an Alcoholic or Just a ...</td>\n",
              "      <td>Day in and day out, CNN works hard to try and ...</td>\n",
              "      <td>Middle-east</td>\n",
              "      <td>January 2, 2017</td>\n",
              "      <td>2017-01-02</td>\n",
              "      <td>1</td>\n",
              "      <td>2017</td>\n",
              "      <td>2</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "<p>1029 rows × 8 columns</p>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-035a5020-58f6-414d-99c2-0d5af6f9a059')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-035a5020-58f6-414d-99c2-0d5af6f9a059 button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-035a5020-58f6-414d-99c2-0d5af6f9a059');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "<div id=\"df-7f1b38e1-e641-4d55-95f5-6964b28d7689\">\n",
              "  <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-7f1b38e1-e641-4d55-95f5-6964b28d7689')\"\n",
              "            title=\"Suggest charts\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "     width=\"24px\">\n",
              "    <g>\n",
              "        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
              "    </g>\n",
              "</svg>\n",
              "  </button>\n",
              "\n",
              "<style>\n",
              "  .colab-df-quickchart {\n",
              "      --bg-color: #E8F0FE;\n",
              "      --fill-color: #1967D2;\n",
              "      --hover-bg-color: #E2EBFA;\n",
              "      --hover-fill-color: #174EA6;\n",
              "      --disabled-fill-color: #AAA;\n",
              "      --disabled-bg-color: #DDD;\n",
              "  }\n",
              "\n",
              "  [theme=dark] .colab-df-quickchart {\n",
              "      --bg-color: #3B4455;\n",
              "      --fill-color: #D2E3FC;\n",
              "      --hover-bg-color: #434B5C;\n",
              "      --hover-fill-color: #FFFFFF;\n",
              "      --disabled-bg-color: #3B4455;\n",
              "      --disabled-fill-color: #666;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart {\n",
              "    background-color: var(--bg-color);\n",
              "    border: none;\n",
              "    border-radius: 50%;\n",
              "    cursor: pointer;\n",
              "    display: none;\n",
              "    fill: var(--fill-color);\n",
              "    height: 32px;\n",
              "    padding: 0;\n",
              "    width: 32px;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart:hover {\n",
              "    background-color: var(--hover-bg-color);\n",
              "    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "    fill: var(--button-hover-fill-color);\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart-complete:disabled,\n",
              "  .colab-df-quickchart-complete:disabled:hover {\n",
              "    background-color: var(--disabled-bg-color);\n",
              "    fill: var(--disabled-fill-color);\n",
              "    box-shadow: none;\n",
              "  }\n",
              "\n",
              "  .colab-df-spinner {\n",
              "    border: 2px solid var(--fill-color);\n",
              "    border-color: transparent;\n",
              "    border-bottom-color: var(--fill-color);\n",
              "    animation:\n",
              "      spin 1s steps(1) infinite;\n",
              "  }\n",
              "\n",
              "  @keyframes spin {\n",
              "    0% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "      border-left-color: var(--fill-color);\n",
              "    }\n",
              "    20% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    30% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    40% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    60% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    80% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "    90% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "  }\n",
              "</style>\n",
              "\n",
              "  <script>\n",
              "    async function quickchart(key) {\n",
              "      const quickchartButtonEl =\n",
              "        document.querySelector('#' + key + ' button');\n",
              "      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.\n",
              "      quickchartButtonEl.classList.add('colab-df-spinner');\n",
              "      try {\n",
              "        const charts = await google.colab.kernel.invokeFunction(\n",
              "            'suggestCharts', [key], {});\n",
              "      } catch (error) {\n",
              "        console.error('Error during call to suggestCharts:', error);\n",
              "      }\n",
              "      quickchartButtonEl.classList.remove('colab-df-spinner');\n",
              "      quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
              "    }\n",
              "    (() => {\n",
              "      let quickchartButtonEl =\n",
              "        document.querySelector('#df-7f1b38e1-e641-4d55-95f5-6964b28d7689 button');\n",
              "      quickchartButtonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "    })();\n",
              "  </script>\n",
              "</div>\n",
              "\n",
              "  <div id=\"id_9dbe19ef-2f5e-42d3-b0b6-ebf0e6272679\">\n",
              "    <style>\n",
              "      .colab-df-generate {\n",
              "        background-color: #E8F0FE;\n",
              "        border: none;\n",
              "        border-radius: 50%;\n",
              "        cursor: pointer;\n",
              "        display: none;\n",
              "        fill: #1967D2;\n",
              "        height: 32px;\n",
              "        padding: 0 0 0 0;\n",
              "        width: 32px;\n",
              "      }\n",
              "\n",
              "      .colab-df-generate:hover {\n",
              "        background-color: #E2EBFA;\n",
              "        box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "        fill: #174EA6;\n",
              "      }\n",
              "\n",
              "      [theme=dark] .colab-df-generate {\n",
              "        background-color: #3B4455;\n",
              "        fill: #D2E3FC;\n",
              "      }\n",
              "\n",
              "      [theme=dark] .colab-df-generate:hover {\n",
              "        background-color: #434B5C;\n",
              "        box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "        filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "        fill: #FFFFFF;\n",
              "      }\n",
              "    </style>\n",
              "    <button class=\"colab-df-generate\" onclick=\"generateWithVariable('short_fake')\"\n",
              "            title=\"Generate code using this dataframe.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "       width=\"24px\">\n",
              "    <path d=\"M7,19H8.4L18.45,9,17,7.55,7,17.6ZM5,21V16.75L18.45,3.32a2,2,0,0,1,2.83,0l1.4,1.43a1.91,1.91,0,0,1,.58,1.4,1.91,1.91,0,0,1-.58,1.4L9.25,21ZM18.45,9,17,7.55Zm-12,3A5.31,5.31,0,0,0,4.9,8.1,5.31,5.31,0,0,0,1,6.5,5.31,5.31,0,0,0,4.9,4.9,5.31,5.31,0,0,0,6.5,1,5.31,5.31,0,0,0,8.1,4.9,5.31,5.31,0,0,0,12,6.5,5.46,5.46,0,0,0,6.5,12Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "    <script>\n",
              "      (() => {\n",
              "      const buttonEl =\n",
              "        document.querySelector('#id_9dbe19ef-2f5e-42d3-b0b6-ebf0e6272679 button.colab-df-generate');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      buttonEl.onclick = () => {\n",
              "        google.colab.notebook.generateWithVariable('short_fake');\n",
              "      }\n",
              "      })();\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "variable_name": "short_fake",
              "summary": "{\n  \"name\": \"short_fake\",\n  \"rows\": 1029,\n  \"fields\": [\n    {\n      \"column\": \"title\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 803,\n        \"samples\": [\n          \" UNHINGED Trump Supporters Visit DC For Inauguration, Celebrate By SPITTING On Migrant Kids (TWEET)\",\n          \" Republicans Just Moved To Repeal Obamacare. They Never Expected Americans To Respond Like THIS (TWEETS)\",\n          \" Donald Trump Will Throw A Hissy Fit After Reading George Will\\u2019s Review Of His Inaugural Address\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"text\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 747,\n        \"samples\": [\n          \"Donald Trump claims he is writing his own Inaugural Address and posted an image to Twitter as  proof.  The Internet laughed out loud.Trump looks like he is not enjoying anything that requires thinking and the image is clearly staged to once again dupe his gullible supporters into believing that he is actually writing it himself. Trump also dubbed his Mar-a-Lago resort the  winter White House,  which also pissed people off.Writing my inaugural address at the Winter White House, Mar-a-Lago, three weeks ago. Looking forward to Friday. #Inauguration pic.twitter.com/S701FdTCQu  Donald J. Trump (@realDonaldTrump) January 18, 2017In reality, it s hard to believe Trump is capable of writing anything longer than 140 characters, so it s obvious that a speechwriter is doing all the work just like Trump hired a ghostwriter to pen his books.The image is so bad, in fact, that Twitter users couldn t help but mercilessly mock Trump.@realDonaldTrump I hope you ve been working on that vocabulary of yours.  
Kris Sanchez (@KrisSanchez) January 18, 2017.@realDonaldTrump  Four Score and Seven Years ago, all the haters, losers and enemies (sadly of which there are many), were born fucked up!  pic.twitter.com/mpg9hkN6dQ  Kristina Wong   (@mskristinawong) January 18, 2017.@realDonaldTrump  ..I stand before you, ready to hit the  sleazebags  back, much better than seeing a psychiatrist (which I never have)..  pic.twitter.com/FsuBAvDr06  Kristina Wong   (@mskristinawong) January 18, 2017.@realDonaldTrump  to fight hoaxes such as Global warming invented by the Chinese, environmentally friendly light bulbs, and vaccines..  pic.twitter.com/mhpy8BBAtJ  Kristina Wong   (@mskristinawong) January 18, 2017.@realDonaldTrump Oh, honey, this looks terribly staged. Is that a magic marker?  Bess Kalb (@bessbell) January 18, 2017.@realDonaldTrump And, dear, I know you need a familiar bed, but the  winter White House  is the White House.  Bess Kalb (@bessbell) January 18, 2017@realDonaldTrump Who is translating it from the original Russian for you?  Neil Miller (@rejects) January 18, 2017.@realDonaldTrump plagiarize Obamas? pic.twitter.com/fTOVZcIGDn  Jordan Uhl (@JordanUhl) January 18, 2017@realDonaldTrump that is a blank piece of paper and you re holding a closed sharpie pic.twitter.com/ekCcH8eTXe  Jules Suzdaltsev (@jules_su) January 18, 2017I notice you re no longer hanging around live eagles pic.twitter.com/WH66BENVpV  Roland Scahill (@rolandscahill) January 18, 2017@realDonaldTrump Between the staging and the obviously blank paper, I m thinking #FakeNews, right?  Danielle Smith (@DanielleSmithTV) January 18, 2017@realDonaldTrump Will it start with  Four Russian Whores And Seven Hacks Ago  Tony Posnanski (@tonyposnanski) January 18, 2017@realDonaldTrump that s not how people write, Donald. You need to put the paper flat on the desk, and open the notepad.  
Matt Haig (@matthaig1) January 18, 2017@realDonaldTrump wow looks great pic.twitter.com/kRxMCN4yLB  elisabeth (@esjesj) January 18, 2017Does anyone want to bet that Trump will fill his speech with many of the greatest words ever spoken by past presidents and take credit for them himself? Because he sure as hell isn t going to think of something original.Featured image via Twitter\",\n          \"Donald Trump may be decrying the documents dumped by Buzzfeed as  fake news,  but it seems to be  fake news  everyone can believe in. Ever since the memos compiled by former MI6 officer Christopher Steele were released to the public, The Donald has not exactly had an easy time of things, with every press conference and attempt to tweet quickly devolving into a screaming mess in which Trump throws himself into a panic screaming of all-caps typing the words  FAKE NEWS. But while Trump is taking the  nothing to see here  approach with the reports, at least four sources now back the information according to a report from BBC s Paul Wood earlier this week: The former MI6 agent is not the only source for the claim about Russian kompromat on the president-elect. Back in August, a retired spy told me he had been informed of its existence by  the head of an East European intelligence agency,  Wood says, explaining that he used an intermediary to check into things. A U.S. intelligence contact told him that there was  more than one  tape, and that video and audio of Trump s exploits in Moscow and St. Petersberg do exist.According to Israeli news publication Yedioth Ahronoth, their country s spies have been warned by the United States against sharing information with the Trump administration until they can determine whether he has been compromised by Putin:These fears, which began upon Trump s election, grew stronger following a meeting held recently between Israeli and American intelligence officials (the date of the meeting is not mentioned to protect the sources of the report). 
During the meeting, according to the Israelis who participated in it, their American colleagues voiced despair over Trump s election, as he often lashes out at the American intelligence community. The American officials also told the Israelis that the National Security Agency (NSA) had  highly credible information  that Russia s intelligence agencies, the FSB and GRU, were responsible for hacking the Democratic Party (DNC) servers during the elections and leaking sensitive information to WikiLeaks, which hurt Democratic presidential candidate Hillary Clinton.The American officials further added that they believed Russia President Vladimir Putin had  leverages of pressure  over Trump   but did not elaborate. They were apparently referring to what was published Wednesday about embarrassing information collected by the Russian intelligence in a bid to blackmail the president-elect.The Americans implied that their Israeli colleagues should  be careful  as of January 20, Trump s inauguration date, when transferring intelligence information to the White House and to the National Security Council (NSC), which is subject to the president. According to the Israelis who were present in the meeting, the Americans recommended that until it is made clear that Trump is not inappropriately connected to Russia and is not being extorted   Israel should avoid revealing sensitive sources to administration officials for fear the information would reach the Iranians.If Israel s secrets are indeed not kept confidential, this is a serious danger to the state s national security: Since the early 2000s, the cooperation between the Israel and US intelligence communities has been intensified. 
It was led by the head of the Israeli Military Intelligence Directorate (AMAN) at the time, Aharon Ze evi Farkash (who even received a citation from the NSA Chief General Michael Hayden), late Mossad chief Meir Dagan and his successor, Tamir Pardo, who served earlier as the commander of one of the secret operational units that cooperated with the Americans. The Israelis who attended the meeting said that the Americans advised them not to expose any sensitive sources to members of the Trump administration, lest that information reach Iranian hands, until it becomes clear that Trump does not have a compromised relationship with Russia and is not vulnerable to extortion,  the report states.At this point, we have no choice but to assume that the level of concern shown by intelligence agencies worldwide is a strong indicator that we should be concerned too. Donald Trump is sworn into office on Friday. May one of the gods help us all.featured image via Getty Images (Chip Somodevilla)\",\n          \"When Donald Trump was a running for office, he and his conservative followers repeatedly called for  locking up  Hillary Clinton for handling government work with a private server while secretary of state. Hypocrisy, thy name is GOP. Newsweek is reporting that senior Trump administration staffers have active accounts on a Republican National Committee email system. Kellyanne Conway, Jared Kushner, Sean Spicer and Steve Bannon are among those who are using the system (rnchq.org)   the same one the George W. Bush administration was accused of using in order to evade transparency rules after claiming to have  lost  22 million emails.What they re doing is not illegal but it is ironic.But after then-candidate Donald Trump and the Republicans repeatedly called for  locking up  Hillary Clinton for handling government work with a private server while secretary of state, the new White House staff risks repeating the same mistake that dogged the Democrat s presidential campaign. 
They also face a security challenge: The RNC email system, according to U.S. intelligence, was hacked during the 2016 race.  They better be careful after making such a huge ruckus over the private email over at the State Department,  says former Bush administration lawyer Richard Painter.And by the way, Hillary Clinton s server use did not violate the law. Still yet, last July, an informal adviser to Donald Trump s presidential campaign called for Hillary Clinton s execution. Al Baldasaro said that Clinton  should be put in the firing line and shot for treason  because of her use of private email server.It s not known yet how the Trump staffers are using the RNC email addresses, however, if they are actively using them then they are subject to the  Disclosure Requirement For Official Business Conducted Using Electronic Messaging Accounts. If White House staffers have already used the RNC emails system for White House work, they must copy or forward those communications into the government system within 20 days.But wait, there s more!The New York Times is reporting that Trump is still using his  old, unsecured Android phone  to take calls and tweet despite  protests from some of his aides. Last night when Trump tweeted that he would send in the Feds  to Chicago, he was using his Android. As for the White House phones, Trump said they are  the world s most secure system,  adding,  The words just explode in the air. 
Notably, this is the same RNC server that multiple intelligence agencies and officials claim was hacked during the 2016 race.Photo by Spencer Platt/Getty Images.\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"subject\",\n      \"properties\": {\n        \"dtype\": \"category\",\n        \"num_unique_values\": 6,\n        \"samples\": [\n          \"News\",\n          \"politics\",\n          \"Middle-east\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"date\",\n      \"properties\": {\n        \"dtype\": \"category\",\n        \"num_unique_values\": 62,\n        \"samples\": [\n          \"Jan 12, 2017\",\n          \"Jan 6, 2017\",\n          \"January 31, 2017\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"Date\",\n      \"properties\": {\n        \"dtype\": \"date\",\n        \"min\": \"2017-01-01 00:00:00\",\n        \"max\": \"2017-01-31 00:00:00\",\n        \"num_unique_values\": 31,\n        \"samples\": [\n          \"2017-01-04 00:00:00\",\n          \"2017-01-16 00:00:00\",\n          \"2017-01-08 00:00:00\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"month\",\n      \"properties\": {\n        \"dtype\": \"int32\",\n        \"num_unique_values\": 1,\n        \"samples\": [\n          1\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"year\",\n      \"properties\": {\n        \"dtype\": \"int32\",\n        \"num_unique_values\": 1,\n        \"samples\": [\n          2017\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"day\",\n      \"properties\": {\n        \"dtype\": \"int32\",\n        
\"num_unique_values\": 31,\n        \"samples\": [\n          4\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"
            }
          },
          "metadata": {},
          "execution_count": 12
        }
      ],
      "source": [
        "# Display the short_fake frame; pandas abbreviates a long frame to its first and last rows.\n",
        "# NOTE(review): the saved output shows only 2017-01 dates, so this is presumably the January-2017 fake-news subset - confirm against the cell that builds it.\n",
        "short_fake"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 13,
      "metadata": {
        "id": "pB_A1cC33BFt"
      },
      "outputs": [],
      "source": [
        "# Keep only the rows of `real` whose year is 2017 and whose month is 1 (i.e. January 2017).\n",
        "short_real = real.loc[(real['year'] == 2017) & (real['month'] == 1)]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 15,
      "metadata": {
        "id": "i9I_9EZy7Xxz",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "9589bfe8-a3ad-47c7-d92f-b42ddd7ef367"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "CPU times: user 11 s, sys: 40.1 ms, total: 11 s\n",
            "Wall time: 11.1 s\n"
          ]
        }
      ],
      "source": [
        "%%time\n",
        "# Run every title in short_fake through `nlp` (presumably the spaCy pipeline loaded earlier - confirm);\n",
        "# the result is a Series holding one parsed document per title, timed as a whole cell.\n",
        "fake_docs = short_fake['title'].map(nlp)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 16,
      "metadata": {
        "id": "kBueyWBkeMvZ",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "a6e745d8-c755-4da0-f272-6565b1099380"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "CPU times: user 7.45 s, sys: 36.3 ms, total: 7.49 s\n",
            "Wall time: 8.47 s\n"
          ]
        }
      ],
      "source": [
        "%time real_docs = short_real['title'].apply(nlp)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 17,
      "metadata": {
        "id": "lYiD8-1a7f43",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "1b1c9eea-8629-4716-db99-ed915b50ce5c"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "[' ', 'Trump', '’s', 'SCOTUS', 'Pick', 'Sided', 'With', 'Hobby', 'Lobby', 'Against', 'Women', ',', 'Thinks', 'Christianity', 'Trumps', '‘', 'Secular', 'Courts', '’']\n",
            "[' ', 'Trump', '’s', 'SCOTUS', 'Pick', 'side', 'with', 'Hobby', 'Lobby', 'against', 'Women', ',', 'think', 'Christianity', 'Trumps', \"'\", 'Secular', 'Courts', \"'\"]\n",
            "['SPACE', 'PROPN', 'PART', 'PROPN', 'PROPN', 'VERB', 'ADP', 'PROPN', 'PROPN', 'ADP', 'PROPN', 'PUNCT', 'VERB', 'PROPN', 'PROPN', 'PUNCT', 'PROPN', 'PROPN', 'PUNCT']\n",
            "['_SP', 'NNP', 'POS', 'NNP', 'NNP', 'VBD', 'IN', 'NNP', 'NNP', 'IN', 'NNPS', ',', 'VBZ', 'NNP', 'NNP', '``', 'NNP', 'NNPS', \"''\"]\n",
            "verb, past participle\n",
            "[ Trump’s SCOTUS Pick, Hobby Lobby, Women, Christianity Trumps, ‘Secular Courts]\n",
            "22\n",
            "1\n",
            "18\n",
            "1\n"
          ]
        }
      ],
      "source": [
        "doc = fake_docs.iloc[0]\n",
        "print([t.text for t in doc]) # tokenized text of the document\n",
        "print([t.lemma_ for t in doc]) # lemmatized text of the document\n",
        "print([t.pos_ for t in doc]) # Part of speech for all tokens in the document\n",
        "print([t.tag_ for t in doc]) # Fine-grained part of speech for all tokens in the document\n",
        "print(spacy.explain('VBN')) # making sense out of the spacy acronyms\n",
        "print([c for c in doc.noun_chunks]) # All the noun phrases in the document\n",
        "print(len([doc for doc in fake_docs if round(doc._.blob.polarity, 2)  < -.9])) # Sentiment analysis of the document. <0 means negative >0 means positive.\n",
        "print(len([doc for doc in real_docs if round(doc._.blob.polarity, 2) < -.9]))\n",
        "\n",
        "print(len([doc for doc in fake_docs if round(doc._.blob.polarity, 2)  > .9])) # Sentiment analysis of the document. <0 means negative >0 means positive.\n",
        "print(len([doc for doc in real_docs if round(doc._.blob.polarity, 2) > .9]))\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 18,
      "metadata": {
        "id": "MtxYdGcWoNic",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "c31250ea-f97d-4c58-829e-4c189218480e"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "[The big furry cat, the little brown mouse]\n"
          ]
        }
      ],
      "source": [
        "text = 'The big furry cat ate the little brown mouse.'\n",
        "doc = nlp(text)\n",
        "print([n for n in doc.noun_chunks])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 19,
      "metadata": {
        "id": "i6wGAhvM86em",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "e825d8ec-84a0-4756-9c9f-912aea9a0b7c"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Counter({'Walls': 1})\n",
            "Counter({'Women': 1, 'Courts': 1})\n",
            "[('Republicans', 29), ('Americans', 18), ('Democrats', 16), ('Rights', 11), ('Women', 10)]\n",
            "[('Republicans', 29), ('Americans', 18), ('Democrats', 16), ('Rights', 11), ('Women', 10), ('Supporters', 10), ('Dems', 8), ('Refugees', 8), ('Muslims', 7), ('Liberals', 7), ('Streets', 6), ('Blacks', 5), ('Orders', 4), ('Workers', 4), ('Delivers', 4), ('Regulations', 4), ('Conservatives', 4), ('Attacks', 4), ('Hits', 4), ('Sessions', 4)]\n"
          ]
        }
      ],
      "source": [
        "from collections import Counter\n",
        "\n",
        "# c = Counter(['a', 'a', 'b', 'b', 'b'])\n",
        "# print(c)\n",
        "# d = Counter(['a', 'b', 'd', 'd'])\n",
        "# d.most_common(1)\n",
        "\n",
        "def count_nouns(doc):\n",
        "  nouns = [\n",
        "      token.lemma_ for token in doc if\n",
        "             token.tag_ == \"NNPS\"]\n",
        "  word_freq = Counter(nouns)\n",
        "  return word_freq\n",
        "\n",
        "# counts = real_docs.apply(count_nouns).sum().most_common(20)\n",
        "# # print(counts)\n",
        "counter_series = fake_docs.apply(count_nouns)\n",
        "print(counter_series.iloc[5])\n",
        "print(counter_series.iloc[0])\n",
        "print(counter_series.sum().most_common(5))\n",
        "print(fake_docs.apply(count_nouns).sum().most_common(20))"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3 (ipykernel)",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.12.3"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}