{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TitleLocationCompanySalaryDescription
0Data ScientistBromleyOSTCNaNCompany Description\\nIn just over 15 years, OS...
1Biostatistician / Data Scientist - Innovative ...LondonWarman O'BrienNaNBiostatistician / Data Scientist - London\\nI a...
2Data ScientistLondonctrlio LtdNaNctrlio helps motor insurers win customers more...
3Junior Data ScientistLondonNovafuturNaNThis is an opportunity to become a Data Scient...
4Data Scientist LeadLondonJPMorgan Chase Bank, N.A.NaNPosition Overview & Responsibilities:\\nThis is...
..................
7838Senior Clinical Operations ManagerLondonCK GroupNaNSam Whyley-Smith at CK Clinical is recruiting ...
7839Global Clinical Operations Leader (Senior Mana...LondonCK GroupNaNCK Clinical are recruiting for a Global Clinic...
7840Actuarial AnalystLondon SW1A 2AHAgeas Insurance LimitedNaNJob Title: Actuarial Analyst\\nSalary: up to £3...
7841Pricing AnalystReigate RH2 9AQAgeas Insurance LimitedNaNActuarial Pricing Analyst:\\nTesco Underwriting...
7842Management AccountantLondon TW8 9ESSafetykleenNaNWho are Safetykleen?\\nWe are the leading provi...
\n", "

7843 rows × 5 columns

\n", "
" ], "text/plain": [ " Title Location \\\n", "0 Data Scientist Bromley \n", "1 Biostatistician / Data Scientist - Innovative ... London \n", "2 Data Scientist London \n", "3 Junior Data Scientist London \n", "4 Data Scientist Lead London \n", "... ... ... \n", "7838 Senior Clinical Operations Manager London \n", "7839 Global Clinical Operations Leader (Senior Mana... London \n", "7840 Actuarial Analyst London SW1A 2AH \n", "7841 Pricing Analyst Reigate RH2 9AQ \n", "7842 Management Accountant London TW8 9ES \n", "\n", " Company Salary \\\n", "0 OSTC NaN \n", "1 Warman O'Brien NaN \n", "2 ctrlio Ltd NaN \n", "3 Novafutur NaN \n", "4 JPMorgan Chase Bank, N.A. NaN \n", "... ... ... \n", "7838 CK Group NaN \n", "7839 CK Group NaN \n", "7840 Ageas Insurance Limited NaN \n", "7841 Ageas Insurance Limited NaN \n", "7842 Safetykleen NaN \n", "\n", " Description \n", "0 Company Description\\nIn just over 15 years, OS... \n", "1 Biostatistician / Data Scientist - London\\nI a... \n", "2 ctrlio helps motor insurers win customers more... \n", "3 This is an opportunity to become a Data Scient... \n", "4 Position Overview & Responsibilities:\\nThis is... \n", "... ... \n", "7838 Sam Whyley-Smith at CK Clinical is recruiting ... \n", "7839 CK Clinical are recruiting for a Global Clinic... \n", "7840 Job Title: Actuarial Analyst\\nSalary: up to £3... \n", "7841 Actuarial Pricing Analyst:\\nTesco Underwriting... \n", "7842 Who are Safetykleen?\\nWe are the leading provi... \n", "\n", "[7843 rows x 5 columns]" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ],
"source": [
 "import pandas as pd\n",
 "import numpy as np\n",
 "\n",
 "# Path of the job-postings CSV, named once so the read below stays short.\n",
 "RAW_CSV = 'data/df_broad.csv'\n",
 "df = pd.read_csv(RAW_CSV)\n",
 "# A bare name on the last line renders the frame as the cell output.\n",
 "df"
] },
{ "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TitleLocationCompanySalaryDescription
0Data ScientistBromleyOSTCNaNCompany Description\\nIn just over 15 years, OS...
1Biostatistician / Data Scientist - Innovative ...LondonWarman O'BrienNaNBiostatistician / Data Scientist - London\\nI a...
2Data ScientistLondonctrlio LtdNaNctrlio helps motor insurers win customers more...
3Junior Data ScientistLondonNovafuturNaNThis is an opportunity to become a Data Scient...
4Data Scientist LeadLondonJPMorgan Chase Bank, N.A.NaNPosition Overview & Responsibilities:\\nThis is...
..................
2274Sr. Clinical Trial PhysicianUxbridgeBristol-Myers SquibbNaNAt Bristol Myers Squibb, we are inspired by a ...
2275Senior Medical Manager – OncologyLondonOnly MedicsNaNReference Number: JO-2003-448184\\nSenior Medic...
2276Azure Data ArchitectLondonVenturiNaNData Architect (Azure Data Architect ( Azure /...
2277Senior Clinical Operations ManagerLondonCK GroupNaNSam Whyley-Smith at CK Clinical is recruiting ...
2278Global Clinical Operations Leader (Senior Mana...LondonCK GroupNaNCK Clinical are recruiting for a Global Clinic...
\n", "

2279 rows × 5 columns

\n", "
" ], "text/plain": [ " Title Location \\\n", "0 Data Scientist Bromley \n", "1 Biostatistician / Data Scientist - Innovative ... London \n", "2 Data Scientist London \n", "3 Junior Data Scientist London \n", "4 Data Scientist Lead London \n", "... ... ... \n", "2274 Sr. Clinical Trial Physician Uxbridge \n", "2275 Senior Medical Manager – Oncology London \n", "2276 Azure Data Architect London \n", "2277 Senior Clinical Operations Manager London \n", "2278 Global Clinical Operations Leader (Senior Mana... London \n", "\n", " Company Salary \\\n", "0 OSTC NaN \n", "1 Warman O'Brien NaN \n", "2 ctrlio Ltd NaN \n", "3 Novafutur NaN \n", "4 JPMorgan Chase Bank, N.A. NaN \n", "... ... ... \n", "2274 Bristol-Myers Squibb NaN \n", "2275 Only Medics NaN \n", "2276 Venturi NaN \n", "2277 CK Group NaN \n", "2278 CK Group NaN \n", "\n", " Description \n", "0 Company Description\\nIn just over 15 years, OS... \n", "1 Biostatistician / Data Scientist - London\\nI a... \n", "2 ctrlio helps motor insurers win customers more... \n", "3 This is an opportunity to become a Data Scient... \n", "4 Position Overview & Responsibilities:\\nThis is... \n", "... ... \n", "2274 At Bristol Myers Squibb, we are inspired by a ... \n", "2275 Reference Number: JO-2003-448184\\nSenior Medic... \n", "2276 Data Architect (Azure Data Architect ( Azure /... \n", "2277 Sam Whyley-Smith at CK Clinical is recruiting ... \n", "2278 CK Clinical are recruiting for a Global Clinic... \n", "\n", "[2279 rows x 5 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ],
"source": [
 "# Flag every row that repeats an earlier row in all columns, keep the rest,\n",
 "# and renumber the surviving rows from 0.\n",
 "is_repeat = df.duplicated(keep='first')\n",
 "df = df[~is_repeat].reset_index(drop=True)\n",
 "df"
] },
{ "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TitleLocationCompanySalaryDescription
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [Title, Location, Company, Salary, Description]\n", "Index: []" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ],
"source": [
 "# List the postings that carry a salary; an empty result means Salary is missing in every row.\n",
 "# notna() is used instead of np.isnan because np.isnan raises TypeError on an\n",
 "# object-dtype column (e.g. once any salary is stored as text).\n",
 "df[df.Salary.notna()]"
] },
{ "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TitleCompanyDescription
0Data ScientistOSTCCompany Description\\nIn just over 15 years, OS...
1Biostatistician / Data Scientist - Innovative ...Warman O'BrienBiostatistician / Data Scientist - London\\nI a...
2Data Scientistctrlio Ltdctrlio helps motor insurers win customers more...
3Junior Data ScientistNovafuturThis is an opportunity to become a Data Scient...
4Data Scientist LeadJPMorgan Chase Bank, N.A.Position Overview & Responsibilities:\\nThis is...
............
2274Sr. Clinical Trial PhysicianBristol-Myers SquibbAt Bristol Myers Squibb, we are inspired by a ...
2275Senior Medical Manager – OncologyOnly MedicsReference Number: JO-2003-448184\\nSenior Medic...
2276Azure Data ArchitectVenturiData Architect (Azure Data Architect ( Azure /...
2277Senior Clinical Operations ManagerCK GroupSam Whyley-Smith at CK Clinical is recruiting ...
2278Global Clinical Operations Leader (Senior Mana...CK GroupCK Clinical are recruiting for a Global Clinic...
\n", "

2279 rows × 3 columns

\n", "
" ], "text/plain": [ " Title \\\n", "0 Data Scientist \n", "1 Biostatistician / Data Scientist - Innovative ... \n", "2 Data Scientist \n", "3 Junior Data Scientist \n", "4 Data Scientist Lead \n", "... ... \n", "2274 Sr. Clinical Trial Physician \n", "2275 Senior Medical Manager – Oncology \n", "2276 Azure Data Architect \n", "2277 Senior Clinical Operations Manager \n", "2278 Global Clinical Operations Leader (Senior Mana... \n", "\n", " Company \\\n", "0 OSTC \n", "1 Warman O'Brien \n", "2 ctrlio Ltd \n", "3 Novafutur \n", "4 JPMorgan Chase Bank, N.A. \n", "... ... \n", "2274 Bristol-Myers Squibb \n", "2275 Only Medics \n", "2276 Venturi \n", "2277 CK Group \n", "2278 CK Group \n", "\n", " Description \n", "0 Company Description\\nIn just over 15 years, OS... \n", "1 Biostatistician / Data Scientist - London\\nI a... \n", "2 ctrlio helps motor insurers win customers more... \n", "3 This is an opportunity to become a Data Scient... \n", "4 Position Overview & Responsibilities:\\nThis is... \n", "... ... \n", "2274 At Bristol Myers Squibb, we are inspired by a ... \n", "2275 Reference Number: JO-2003-448184\\nSenior Medic... \n", "2276 Data Architect (Azure Data Architect ( Azure /... \n", "2277 Sam Whyley-Smith at CK Clinical is recruiting ... \n", "2278 CK Clinical are recruiting for a Global Clinic... 
\n", "\n", "[2279 rows x 3 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ],
"source": [
 "# Keep only Title, Company and Description (this drops both Location and Salary).\n",
 "# Columns are selected by name, so the cell does not depend on column order and\n",
 "# can be re-run after df has already been narrowed.\n",
 "df = df[['Title','Company','Description']]\n",
 "df"
] },
{ "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [
 "def title_contains_any(titles, terms):\n",
 "    \"\"\"Return a boolean mask of the titles containing at least one of `terms`.\n",
 "\n",
 "    Terms are matched as literal, case-sensitive substrings (regex=False), so the\n",
 "    dot in 'Sr.' means a real dot rather than the regex wildcard. Missing titles\n",
 "    count as non-matches (na=False), which keeps the mask usable for indexing and `~`.\n",
 "    \"\"\"\n",
 "    mask = pd.Series(False, index=titles.index)\n",
 "    for term in terms:\n",
 "        mask = mask | titles.str.contains(term, regex=False, na=False)\n",
 "    return mask\n",
 "\n",
 "# NOTE(review): 'Sr.' is matched literally, so a title written 'Sr ' without the dot\n",
 "# is not treated as lead; add 'Sr ' to the list if those titles should count.\n",
 "lead_search = ['Lead','Senior','Sr.']\n",
 "# Build the mask once so lead and not_lead are exact complements of each other.\n",
 "is_lead = title_contains_any(df.Title, lead_search)\n",
 "lead = df[is_lead].reset_index(drop=True)\n",
 "not_lead = df[~is_lead]\n",
 "lead.to_csv('processed data/lead.csv',index=False)"
] },
{ "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [
 "# Renumber the non-lead rows from 0 (rebinding the name instead of mutating a slice in place).\n",
 "not_lead = not_lead.reset_index(drop=True)\n",
 "not_lead.to_csv('processed data/not_lead.csv',index=False)"
] },
{ "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [
 "# Junior roles are searched only among the non-lead postings.\n",
 "junior_search = ['Junior','Jr','Graduate']\n",
 "junior = not_lead[title_contains_any(not_lead.Title, junior_search)].reset_index(drop=True)\n",
 "junior.to_csv('processed data/junior.csv',index=False)"
] },
{ "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [
 "# Data-scientist postings are taken from df, i.e. from lead and non-lead titles alike.\n",
 "data_scientist = df[df.Title.str.contains('Data Scientist',regex=False,na=False)]\n",
 "data_scientist.to_csv('processed data/data_scientist.csv',index=False)"
] },
{ "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [
 "# Either word on its own qualifies a title, so the phrase 'Machine Learning' needs no separate entry.\n",
 "# NOTE(review): 'Learning' alone may also match non-ML titles (e.g. 'Learning and Development');\n",
 "# confirm that this breadth is intended.\n",
 "ml_search = ['Machine','Learning']\n",
 "ml = not_lead[title_contains_any(not_lead.Title, ml_search)].reset_index(drop=True)\n",
 "ml.to_csv('processed data/ml.csv',index=False)"
] },
{ "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "str" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Join every data-scientist description into a single ';'-separated string.\n",
 "all_desc=data_scientist.Description.str.cat(sep=';')\n",
 "type(all_desc)"
] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [],
"source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" } }, "nbformat": 4, "nbformat_minor": 2 }