import pandas as pd

# Read every line of dates.txt (trailing newlines are kept) and wrap the
# resulting list in a Series so the vectorised .str methods can be used on it.
with open('dates.txt') as file:
    doc = file.readlines()

df = pd.Series(doc)
# Notebook preview of the first ten notes; has no effect when run as a script.
df.head(10)

0         03/25/93 Total time of visit (in minutes):\n
1                       6/18/85 Primary Care Doctor:\n
2    sshe plans to move as of 7/8/71 In-Home Servic...
3                7 on 9/27/75 Audit C Score Current:\n
4    2/6/96 sleep studyPain Treatment Pain Level (N...
5                    .Per 7/06/79 Movement D/O note:\n
6    4, 5/18/78 Patient's thoughts about current su...
7    10/24/89 CPT Code: 90801 - Psychiatric Diagnos...
8                         3/7/86 SOS-10 Total Score:\n
9             (4/10/71)Score-1Audit C Score Current:\n
dtype: object


              
                import numpy as np
import re

# Your code here
#     Testing Data
#     df = ["•04/20/2009;", "04/20/09;", "4/20/09;", "4/3/09;",
#     "•Mar-20-2009;", "Mar 20, 2009;", "March 20, 2009;", "Mar. 20, 2009;", "Mar 20 2009;", "October 14 1974",
#     "•20 Mar 2009;", "20 March 2009;", "20 Mar. 2009;", "20 March, 2009","2June, 1999",
#     "•Mar 20th, 2009;", "Mar 21st, 2009;", "Mar 22nd, 2009",
#     "•Feb 2009;", "Sep 2009;", "Oct 2010",
#     "•6/2008;", "12/2009",
#     "•2009;", "2010"]
    
# df = ["•04/20/2009;", "04/20/09;", "4/20/09;", "4/3/09;",
# Exploratory version of the date extraction: each layout below is matched
# against every line of `df`, and a line is dated by the first (most specific)
# layout that matches it.

# Layout 1 - numeric month/day/year: "04/20/2009", "04/20/09", "4/20/09", "4/3/09"
pattern1 = r'(0?[1-9]|1[0-2])[\/\-](0?[1-9]|[12]\d|30|31)[\/\-](\d{4}|\d{2})'
df1 = df.str.extractall(pattern1)
df1.columns = ["month", "day", "year"]
# reset_index exposes the line label as "level_0" and the match number as "match"
df1 = df1.reset_index()

# Layout 2 - month name, day, year: "Mar-20-2009", "Mar 20, 2009",
# "March 20, 2009", "Mar. 20, 2009", "Mar 20 2009", "October 14 1974",
# "Mar 20th, 2009", "Mar 21st, 2009", "Mar 22nd, 2009"
pattern2 = r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z\.]*[ -](\d{1,2})[a-z\.\,]*[ -](\d{4})'
df2 = df.str.extractall(pattern2)
df2.columns = ["month", "day", "year"]
df2 = df2.reset_index()

# Layout 3 - optional day, month name, year: "20 Mar 2009", "20 March 2009",
# "20 Mar. 2009", "20 March, 2009", "2June, 1999", "Feb 2009", "Sep 2009"
pattern3 = r'(\d{1,2})?[ -]?(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z\.\,]*[ -](\d{4})'
df3 = df.str.extractall(pattern3)
df3.columns = ["day", "month", "year"]
df3 = df3.reset_index()

# Layout 4 - numeric month/year: "6/2008", "12/2009" (day is unknown)
pattern4 = r'(\d{1,2})[/](\d{4})'
df4 = df.str.extractall(pattern4)
df4.insert(0, column='day', value=np.nan)
df4.columns = ["day", "month", "year"]
df4 = df4.reset_index()

# Layout 5 - bare year: "2009", "2010" (day and month are unknown)
pattern5 = r'(\d{4})'
df5 = df.str.extractall(pattern5)
df5.insert(0, column='day', value=np.nan)
df5.insert(1, column='month', value=np.nan)
# Labels follow the actual column order produced by the two inserts above.
df5.columns = ["day", "month", "year"]
df5 = df5.reset_index()

# Stack the layouts in priority order, skipping lines that an earlier layout
# already dated. pd.concat replaces DataFrame.append, which pandas deprecated
# and later removed.
output = pd.concat([df1, df2[~df2.level_0.isin(df1.level_0)]])
output = pd.concat([output, df3[~df3.level_0.isin(output.level_0)]])
output = pd.concat([output, df4[~df4.level_0.isin(output.level_0)]])
output = pd.concat([output, df5[~df5.level_0.isin(output.level_0)]])

output = pd.DataFrame(output, columns=["level_0", "match", "day", "month", "year"])
# Two-digit years are taken to be in the 1900s.
output["year"] = np.where(output["year"].apply(len) == 2, "19" + output["year"], output["year"])
# A missing day or month defaults to the first of the month / January.
output = output.fillna("1")

month_replace = {
    'Jan': 1,
    'Feb': 2,
    'Mar': 3,
    'Apr': 4,
    'May': 5,
    'Jun': 6,
    'Jul': 7,
    'Aug': 8,
    'Sep': 9,
    'Oct': 10,
    'Nov': 11,
    'Dec': 12,
}

# Month names become numbers; numeric month strings are converted by astype below.
output["month"] = output["month"].replace(month_replace)

output["day"] = output["day"].astype(int)
output["month"] = output["month"].astype(int)
output["year"] = output["year"].astype(int)

# to_datetime assembles a date from the year/month/day columns.
output["date"] = pd.to_datetime(output.loc[:, ["year", "month", "day"]])

# Chronological order; ties are broken by the original line label.
output = output.sort_values(["date", "level_0"]).reset_index(drop=True)

C:\Users\Don\AppData\Local\Temp\ipykernel_27152\3236873118.py:57: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  output = df1.append(df2[~df2.level_0.isin(df1.level_0)])
C:\Users\Don\AppData\Local\Temp\ipykernel_27152\3236873118.py:59: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  output = output.append(df3[~df3.level_0.isin(output.level_0)])
C:\Users\Don\AppData\Local\Temp\ipykernel_27152\3236873118.py:61: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  output = output.append(df4[~df4.level_0.isin(output.level_0)])
C:\Users\Don\AppData\Local\Temp\ipykernel_27152\3236873118.py:63: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  output = output.append(df5[~df5.level_0.isin(output.level_0)])


              
                def date_sorter():
    
    # Your code here
    import numpy as np
    import re

    # Your code here
#     Testing Data
#     df = ["•04/20/2009;", "04/20/09;", "4/20/09;", "4/3/09;",
#     "•Mar-20-2009;", "Mar 20, 2009;", "March 20, 2009;", "Mar. 20, 2009;", "Mar 20 2009;", "October 14 1974",
#     "•20 Mar 2009;", "20 March 2009;", "20 Mar. 2009;", "20 March, 2009","2June, 1999",
#     "•Mar 20th, 2009;", "Mar 21st, 2009;", "Mar 22nd, 2009",
#     "•Feb 2009;", "Sep 2009;", "Oct 2010",
#     "•6/2008;", "12/2009",
#     "•2009;", "2010"]
        
    # df = ["•04/20/2009;", "04/20/09;", "4/20/09;", "4/3/09;",
    pattern1 = r'(0?[1-9]|1[0-2])[\/\-](0?[1-9]|[12]\d|30|31)[\/\-](\d{4}|\d{2})'
    df1 = df.str.extractall(pattern1)
    df1.columns = ["month", "day", "year"]
    df1 = df1.reset_index()
    #df1
    
    #"•Mar-20-2009;", "Mar 20, 2009;", "March 20, 2009;", "Mar. 20, 2009;", "Mar 20 2009;",
    #October 14 1974
    #"•Mar 20th, 2009;", "Mar 21st, 2009;", "Mar 22nd, 2009",
    #pattern2 = r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z\.]*[ -](\d{1,2})[a-z]{0,2},[ -](\d{4})'
    pattern2 = r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z\.]*[ -](\d{1,2})[a-z\.\,]*[ -](\d{4})'
    df2=df.str.extractall(pattern2)
    df2.columns = ["month", "day", "year"]
    df2 = df2.reset_index()
    #df2
    
    #"•20 Mar 2009;", "20 March 2009;", "20 Mar. 2009;", "20 March, 2009","2June, 1999",
    # "•Feb 2009;", "Sep 2009;", "Oct 2010",
    pattern3 = r'(\d{1,2})?[ -]?(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z\.\,]*[ -](\d{4})'
    df3=df.str.extractall(pattern3)
    df3.columns = ["day", "month", "year"]
    df3 = df3.reset_index()
    #df3
    
    
    # "•6/2008;", "12/2009",
    pattern4 = r'(\d{1,2})[/](\d{4})'
    df4 = df.str.extractall(pattern4)
    df4.insert(0, column='day', value=np.nan)
    df4.columns = ["day" , "month", "year"]
    df4 = df4.reset_index()
    #df4
    
    ## "•2009;", "2010"
    pattern5 = r'(\d{4})'
    df5 = df.str.extractall(pattern5)
    df5.insert(0, column='day', value=np.nan)
    df5.insert(1, column='month', value=np.nan)
    df5.columns = ["month", "day", "year"]
    df5 = df5=df5.reset_index()
    #df5
    
    output = df1.append(df2[~df2.level_0.isin(df1.level_0)])
    #output.shape
    output = output.append(df3[~df3.level_0.isin(output.level_0)])
    #output.shape
    output = output.append(df4[~df4.level_0.isin(output.level_0)])
    #output.shape
    output = output.append(df5[~df5.level_0.isin(output.level_0)])
    #output.shape
    
    output = pd.DataFrame(output,columns = ["level_0", "match", "day", "month","year"])
    output.year = np.where(output.year.apply(len)==2, "19"+output.year, output.year)
    output = output.fillna("1")
    
    month_replace ={
            'Jan' : 1,
            'Feb' : 2,
            'Mar' : 3,
            'Apr' : 4,
            'May' : 5,
            'Jun' : 6,
            'Jul' : 7,
            'Aug' : 8,
            'Sep' : 9, 
            'Oct' : 10,
            'Nov' : 11,
            'Dec' : 12
    }
    
    output.month = output.month.replace(month_replace)
    
    output.day = output.day.astype(int)
    output.month = output.month.astype(int)
    output.year = output.year.astype(int)
    
    output["date"] = pd.to_datetime(output.loc[:,["year", "month", "day"]])
    
    output = output.sort_values(["date", "level_0"]).reset_index(drop=True)
    #output.info()
    #return_value = pd.Series(output.level_0, name="index")
    
    return output.level_0#return_value # Your answer here


              
                # Run the sorter with no arguments, so it reads the module-level `df`;
                # in the notebook the returned Series is shown as the cell output.
                date_sorter()

C:\Users\Don\AppData\Local\Temp\ipykernel_27152\466805847.py:60: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  output = df1.append(df2[~df2.level_0.isin(df1.level_0)])
C:\Users\Don\AppData\Local\Temp\ipykernel_27152\466805847.py:62: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  output = output.append(df3[~df3.level_0.isin(output.level_0)])
C:\Users\Don\AppData\Local\Temp\ipykernel_27152\466805847.py:64: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  output = output.append(df4[~df4.level_0.isin(output.level_0)])
C:\Users\Don\AppData\Local\Temp\ipykernel_27152\466805847.py:66: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  output = output.append(df5[~df5.level_0.isin(output.level_0)])

0        9
1       84
2        2
3       53
4       28
      ... 
495    427
496    141
497    186
498    161
499    413
Name: level_0, Length: 500, dtype: int64

Assignment 1